2005-04-17 02:20:36 +04:00
/*
* INETPEER - A storage for permanent information about peers
*
* This source is covered by the GNU GPL, the same as all kernel sources.
*
* Authors: Andrey V. Savochkin <saw@msu.ru>
*/
# include <linux/module.h>
# include <linux/types.h>
# include <linux/slab.h>
# include <linux/interrupt.h>
# include <linux/spinlock.h>
# include <linux/random.h>
# include <linux/timer.h>
# include <linux/time.h>
# include <linux/kernel.h>
# include <linux/mm.h>
# include <linux/net.h>
2005-08-16 09:18:02 +04:00
# include <net/ip.h>
2005-04-17 02:20:36 +04:00
# include <net/inetpeer.h>
2011-08-04 07:50:44 +04:00
# include <net/secure_seq.h>
2005-04-17 02:20:36 +04:00
/*
* Theory of operations .
* We keep one entry for each peer IP address. The nodes contain long-living
* information about the peer which doesn't depend on routes.
* At this moment this information consists only of the ID field for the next
* outgoing IP packet. This field is incremented with each packet as encoded
* in the inet_getid() function (include/net/inetpeer.h).
* At the time of writing these notes, the identifier of IP packets is generated
* to be unpredictable using this code only for packets subjected
* (actually or potentially) to defragmentation, i.e. DF packets less than
* PMTU in size use a constant ID and do not use this code (see
* ip_select_ident() in include/net/ip.h).
*
* Route cache entries hold references to our nodes.
* New cache entries get references via lookup by destination IP address in
* the AVL tree. The reference is grabbed only when it's needed, i.e. only
* when we try to output an IP packet which needs an unpredictable ID (see
* __ip_select_ident() in net/ipv4/route.c).
* Nodes are removed only when the reference counter goes to 0.
* Once that has happened, the node may be removed when a sufficient amount of
* time has passed since its last use. The least-recently-used entry can
* also be removed if the pool is overloaded, i.e. if the total number of
* entries is greater than or equal to the threshold.
*
* The node pool is organised as an AVL tree.
* Such an implementation has been chosen not just for fun. It's a way to
* prevent easy and efficient DoS attacks by creating hash collisions. A huge
* number of long-living nodes in a single hash slot would significantly delay
* lookups performed with disabled BHs.
*
* Serialisation issues .
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
* 1. Nodes may appear in the tree only with the pool lock held .
* 2. Nodes may disappear from the tree only with the pool lock held
2005-04-17 02:20:36 +04:00
* AND reference count being 0.
2011-06-08 17:35:34 +04:00
* 3. Global variable peer_total is modified under the pool lock .
* 4. struct inet_peer fields modification :
2005-04-17 02:20:36 +04:00
* avl_left , avl_right , avl_parent , avl_height : pool lock
* refcnt : atomically against modifications on other CPU ;
* usually under some other lock to prevent node disappearing
2010-11-30 22:53:55 +03:00
* daddr : unchangeable
2010-06-16 08:52:13 +04:00
* ip_id_count : atomic value ( no lock needed )
2005-04-17 02:20:36 +04:00
*/
2006-12-07 07:33:20 +03:00
/* Slab cache sized for struct inet_peer; created once in inet_initpeers(). */
static struct kmem_cache * peer_cachep __read_mostly ;
2005-04-17 02:20:36 +04:00
/*
 * Height stored in an AVL node.  The argument and the whole expansion are
 * parenthesised so that expressions such as node_height(p + 1) expand
 * correctly.
 */
#define node_height(x) ((x)->avl_height)
2010-06-14 23:35:21 +04:00
/*
 * Sentinel that stands for "no node here": the address of peer_fake_node
 * with its const qualifier cast away.  The tree walks in this file stop
 * when they reach this pointer.
 */
# define peer_avl_empty ((struct inet_peer *)&peer_fake_node)
2010-10-26 03:55:38 +04:00
/*
 * The same sentinel address, typed as an __rcu pointer so it can be stored
 * in, and compared against, the __rcu-annotated link fields.
 */
# define peer_avl_empty_rcu ((struct inet_peer __rcu __force *)&peer_fake_node)
2010-06-14 23:35:21 +04:00
static const struct inet_peer peer_fake_node = {
2010-10-26 03:55:38 +04:00
. avl_left = peer_avl_empty_rcu ,
. avl_right = peer_avl_empty_rcu ,
2005-04-17 02:20:36 +04:00
. avl_height = 0
} ;
2010-06-14 23:35:21 +04:00
2010-11-30 23:12:23 +03:00
struct inet_peer_base {
2010-10-26 03:55:38 +04:00
struct inet_peer __rcu * root ;
2011-03-05 01:33:59 +03:00
seqlock_t lock ;
2010-06-14 23:35:21 +04:00
int total ;
2010-11-30 23:12:23 +03:00
} ;
static struct inet_peer_base v4_peers = {
2010-10-26 03:55:38 +04:00
. root = peer_avl_empty_rcu ,
2011-03-05 01:33:59 +03:00
. lock = __SEQLOCK_UNLOCKED ( v4_peers . lock ) ,
2010-06-14 23:35:21 +04:00
. total = 0 ,
} ;
2010-11-30 23:12:23 +03:00
static struct inet_peer_base v6_peers = {
. root = peer_avl_empty_rcu ,
2011-03-05 01:33:59 +03:00
. lock = __SEQLOCK_UNLOCKED ( v6_peers . lock ) ,
2010-11-30 23:12:23 +03:00
. total = 0 ,
} ;
2005-04-17 02:20:36 +04:00
/*
 * Upper bound on tree depth.  lookup_rcu() also uses it as the maximum
 * number of links it follows before giving up.
 */
# define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
/* Exported for sysctl_net_ipv4. */
2007-03-07 07:23:10 +03:00
/*
 * Pool tunables.  inet_peer_threshold is scaled down at boot by
 * inet_initpeers() on machines with little memory.
 */
int inet_peer_threshold __read_mostly = 65536 + 128;	/* start to throw entries more
							 * aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ;	/* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ;	/* usual time to live: 10 min */
2005-04-17 02:20:36 +04:00
/* Called from ip_output.c:ip_init */
/*
 * Boot-time setup of the peer storage: scale inet_peer_threshold down on
 * small-memory machines, then create the slab cache for struct inet_peer.
 *
 * The three memory checks are deliberately not "else if": they accumulate,
 * so a machine at or below the smallest limit has the threshold shifted
 * right by 1 + 1 + 2 = 4 bits in total.
 */
void __init inet_initpeers(void)
{
	struct sysinfo si;

	/* Use the straight interface to information about memory. */
	si_meminfo(&si);

	/* The values below were suggested by Alexey Kuznetsov
	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
	 * myself.  --SAW
	 */
	if (si.totalram <= (32768 * 1024) / PAGE_SIZE)
		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
	if (si.totalram <= (16384 * 1024) / PAGE_SIZE)
		inet_peer_threshold >>= 1; /* about 512KB */
	if (si.totalram <= (8192 * 1024) / PAGE_SIZE)
		inet_peer_threshold >>= 2; /* about 128KB */

	/* SLAB_PANIC is passed, so the result is not checked here. */
	peer_cachep = kmem_cache_create("inet_peer_cache",
					sizeof(struct inet_peer),
					0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
					NULL);
}
2010-12-02 04:28:18 +03:00
static int addr_compare ( const struct inetpeer_addr * a ,
const struct inetpeer_addr * b )
2010-11-30 23:08:53 +03:00
{
int i , n = ( a - > family = = AF_INET ? 1 : 4 ) ;
for ( i = 0 ; i < n ; i + + ) {
2011-02-10 01:30:26 +03:00
if ( a - > addr . a6 [ i ] = = b - > addr . a6 [ i ] )
2010-11-30 23:08:53 +03:00
continue ;
2011-02-10 01:30:26 +03:00
if ( a - > addr . a6 [ i ] < b - > addr . a6 [ i ] )
2010-11-30 23:08:53 +03:00
return - 1 ;
return 1 ;
}
return 0 ;
}
2011-03-05 01:33:59 +03:00
/*
 * Dereference an __rcu pointer on the update side.  The lockdep condition
 * states that the spinlock inside BASE's seqlock is held.
 */
#define rcu_deref_locked(X, BASE)				\
	rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock))
2007-03-07 07:23:10 +03:00
/*
* Called with local BH disabled and the pool lock held .
*/
2010-11-30 22:41:59 +03:00
/*
 * lookup - find _daddr in _base's tree while recording the path taken.
 *
 * Relies on a variable named 'stackptr' in the caller's scope: it is reset
 * to _stack, then the address of every link followed (starting with
 * &_base->root) is pushed through it, so that peer_avl_rebalance() can walk
 * the path back afterwards.
 *
 * Evaluates to the matching node, or to peer_avl_empty when the address is
 * not in the tree.  Macro arguments are parenthesised so that non-trivial
 * expressions can be passed safely.
 */
#define lookup(_daddr, _stack, _base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
								\
	stackptr = (_stack);					\
	*stackptr++ = &(_base)->root;				\
	for (u = rcu_deref_locked((_base)->root, (_base));	\
	     u != peer_avl_empty; ) {				\
		int cmp = addr_compare((_daddr), &u->daddr);	\
		if (cmp == 0)					\
			break;					\
		if (cmp == -1)					\
			v = &u->avl_left;			\
		else						\
			v = &u->avl_right;			\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, (_base));		\
	}							\
	u;							\
})
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/*
2011-03-09 01:59:28 +03:00
* Called with rcu_read_lock ( )
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
* Because we hold no lock against a writer, it's quite possible we fall
* into an endless loop.
* But every pointer we follow is guaranteed to be valid thanks to RCU.
* We exit from this function once the number of links followed reaches PEER_MAXDEPTH.
*/
2011-03-09 01:59:28 +03:00
static struct inet_peer * lookup_rcu ( const struct inetpeer_addr * daddr ,
2011-06-08 17:35:34 +04:00
struct inet_peer_base * base )
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
{
2011-03-09 01:59:28 +03:00
struct inet_peer * u = rcu_dereference ( base - > root ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
int count = 0 ;
while ( u ! = peer_avl_empty ) {
2010-11-30 23:08:53 +03:00
int cmp = addr_compare ( daddr , & u - > daddr ) ;
if ( cmp = = 0 ) {
2010-06-16 08:47:39 +04:00
/* Before taking a reference, check if this entry was
2011-06-08 17:35:34 +04:00
* deleted ( refcnt = - 1 )
2010-06-16 08:47:39 +04:00
*/
2011-06-08 17:35:34 +04:00
if ( ! atomic_add_unless ( & u - > refcnt , 1 , - 1 ) )
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
u = NULL ;
return u ;
}
2010-11-30 23:08:53 +03:00
if ( cmp = = - 1 )
2011-03-09 01:59:28 +03:00
u = rcu_dereference ( u - > avl_left ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
else
2011-03-09 01:59:28 +03:00
u = rcu_dereference ( u - > avl_right ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
if ( unlikely ( + + count = = PEER_MAXDEPTH ) )
break ;
}
return NULL ;
}
/* Called with local BH disabled and the pool lock held. */
2010-11-30 22:41:59 +03:00
/*
 * lookup_rightempty - from start's left child, follow right links until a
 * node whose right link is the empty sentinel, and evaluate to that node.
 *
 * The address of every link followed (beginning with &start->avl_left) is
 * pushed through the caller-scope variable 'stackptr'.  If start has no
 * left child the result is the empty sentinel itself, because the
 * sentinel's own right link is peer_avl_empty_rcu.
 */
#define lookup_rightempty(start, base)				\
({								\
	struct inet_peer *u;					\
	struct inet_peer __rcu **v;				\
	*stackptr++ = &(start)->avl_left;			\
	v = &(start)->avl_left;					\
	for (u = rcu_deref_locked(*v, base);			\
	     u->avl_right != peer_avl_empty_rcu; ) {		\
		v = &u->avl_right;				\
		*stackptr++ = v;				\
		u = rcu_deref_locked(*v, base);			\
	}							\
	u;							\
})
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/* Called with local BH disabled and the pool lock held.
2005-04-17 02:20:36 +04:00
* Variable names are the proof of operation correctness .
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
* Look into mm/map_avl.c for a more detailed description of the ideas.
*/
2010-10-26 03:55:38 +04:00
/*
 * Restore the AVL balance along a recorded path.
 *
 * @stack:    start of the array of link addresses recorded during lookup
 * @stackend: one past the last recorded link
 * @base:     pool the tree belongs to; used here only for the lock check
 *            inside rcu_deref_locked()
 *
 * The path is walked from the deepest link back to the first one.  At each
 * step the heights of the two subtrees are compared: if they differ by more
 * than one, a single or double rotation is performed and the link at *nodep
 * is redirected to the new subtree root; otherwise only the node's height
 * is recomputed.  The order of the pointer stores inside each rotation is
 * kept exactly as it was.
 */
static void peer_avl_rebalance(struct inet_peer __rcu **stack[],
			       struct inet_peer __rcu ***stackend,
			       struct inet_peer_base *base)
{
	struct inet_peer __rcu **nodep;
	struct inet_peer *node, *l, *r;
	int lh, rh;

	while (stackend > stack) {
		nodep = *--stackend;
		node = rcu_deref_locked(*nodep, base);
		l = rcu_deref_locked(node->avl_left, base);
		r = rcu_deref_locked(node->avl_right, base);
		lh = node_height(l);
		rh = node_height(r);
		if (lh > rh + 1) { /* l: RH+2 */
			struct inet_peer *ll, *lr, *lrl, *lrr;
			int lrh;

			ll = rcu_deref_locked(l->avl_left, base);
			lr = rcu_deref_locked(l->avl_right, base);
			lrh = node_height(lr);
			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
				RCU_INIT_POINTER(node->avl_left, lr);	/* lr: RH or RH+1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH+1 */
				RCU_INIT_POINTER(l->avl_right, node);	/* node: RH+1 or RH+2 */
				l->avl_height = node->avl_height + 1;
				RCU_INIT_POINTER(*nodep, l);
			} else { /* ll: RH, lr: RH+1 */
				lrl = rcu_deref_locked(lr->avl_left, base);	/* lrl: RH or RH-1 */
				lrr = rcu_deref_locked(lr->avl_right, base);	/* lrr: RH or RH-1 */
				RCU_INIT_POINTER(node->avl_left, lrr);	/* lrr: RH or RH-1 */
				RCU_INIT_POINTER(node->avl_right, r);	/* r: RH */
				node->avl_height = rh + 1; /* node: RH+1 */
				RCU_INIT_POINTER(l->avl_left, ll);	/* ll: RH */
				RCU_INIT_POINTER(l->avl_right, lrl);	/* lrl: RH or RH-1 */
				l->avl_height = rh + 1;	/* l: RH+1 */
				RCU_INIT_POINTER(lr->avl_left, l);	/* l: RH+1 */
				RCU_INIT_POINTER(lr->avl_right, node);	/* node: RH+1 */
				lr->avl_height = rh + 2;
				RCU_INIT_POINTER(*nodep, lr);
			}
		} else if (rh > lh + 1) { /* r: LH+2 */
			struct inet_peer *rr, *rl, *rlr, *rll;
			int rlh;

			rr = rcu_deref_locked(r->avl_right, base);
			rl = rcu_deref_locked(r->avl_left, base);
			rlh = node_height(rl);
			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
				RCU_INIT_POINTER(node->avl_right, rl);	/* rl: LH or LH+1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH+1 */
				RCU_INIT_POINTER(r->avl_left, node);	/* node: LH+1 or LH+2 */
				r->avl_height = node->avl_height + 1;
				RCU_INIT_POINTER(*nodep, r);
			} else { /* rr: LH, rl: LH+1 */
				rlr = rcu_deref_locked(rl->avl_right, base);	/* rlr: LH or LH-1 */
				rll = rcu_deref_locked(rl->avl_left, base);	/* rll: LH or LH-1 */
				RCU_INIT_POINTER(node->avl_right, rll);	/* rll: LH or LH-1 */
				RCU_INIT_POINTER(node->avl_left, l);	/* l: LH */
				node->avl_height = lh + 1; /* node: LH+1 */
				RCU_INIT_POINTER(r->avl_right, rr);	/* rr: LH */
				RCU_INIT_POINTER(r->avl_left, rlr);	/* rlr: LH or LH-1 */
				r->avl_height = lh + 1;	/* r: LH+1 */
				RCU_INIT_POINTER(rl->avl_right, r);	/* r: LH+1 */
				RCU_INIT_POINTER(rl->avl_left, node);	/* node: LH+1 */
				rl->avl_height = lh + 2;
				RCU_INIT_POINTER(*nodep, rl);
			}
		} else {
			/* Already balanced here: just refresh the height. */
			node->avl_height = (lh > rh ? lh : rh) + 1;
		}
	}
}
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note that this lookup can fail even if the target is in the AVL tree, because a
concurrent writer can leave the tree in an inconsistent state.
If this attempt fails, the lock is taken and a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, it is probably possible to revert this part, if the rcu
field is put in a union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp, as these fields are only manipulated with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/*
 * link_to_pool - insert the new leaf @n into the AVL tree of @base.
 *
 * Called with local BH disabled and the pool lock held.
 *
 * This macro relies on two variables of the expanding function, 'stack'
 * and 'stackptr': stackptr must point one past the slot in which @n is to
 * be stored (the slot is reached with **--stackptr), and the recorded path
 * is then handed to peer_avl_rebalance().
 * NOTE(review): that state is presumably left behind by a preceding
 * lookup() on the same key - confirm against the lookup() macro.
 *
 * @n is evaluated several times, so it must be free of side effects.
 */
#define link_to_pool(n, base)					\
do {								\
	(n)->avl_height = 1;					\
	(n)->avl_left = peer_avl_empty_rcu;			\
	(n)->avl_right = peer_avl_empty_rcu;			\
	/* lockless readers can catch us now */			\
	rcu_assign_pointer(**--stackptr, (n));			\
	peer_avl_rebalance(stack, stackptr, (base));		\
} while (0)
2005-04-17 02:20:36 +04:00
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note that this lookup can fail even if the target is in the AVL tree, because a
concurrent writer can leave the tree in an inconsistent state.
If this attempt fails, the lock is taken and a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, it is probably possible to revert this part, if the rcu
field is put in a union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp, as these fields are only manipulated with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/*
 * RCU callback: return the inet_peer that embeds @head (as its 'rcu'
 * member) to the peer_cachep slab cache.
 */
static void inetpeer_free_rcu(struct rcu_head *head)
{
	struct inet_peer *peer = container_of(head, struct inet_peer, rcu);

	kmem_cache_free(peer_cachep, peer);
}
2011-04-12 02:39:40 +04:00
/*
 * Remove @p from the AVL tree of @base, rebalance the tree, decrement the
 * pool's entry count and queue @p for freeing through call_rcu().
 *
 * @stack is scratch space in which the path to @p is recorded; its previous
 * contents are overwritten.  BUG()s if @p is not found in the tree.
 */
static void unlink_from_pool(struct inet_peer *p, struct inet_peer_base *base,
			     struct inet_peer __rcu **stack[PEER_MAXDEPTH])
{
	/*
	 * NOTE(review): stackptr is never assigned directly in this function;
	 * lookup() and lookup_rightempty() presumably set/advance it by name
	 * as a side effect - confirm against their definitions.
	 */
	struct inet_peer __rcu ***stackptr, ***pslot;

	if (lookup(&p->daddr, stack, base) != p)
		BUG();

	pslot = stackptr - 1;	/* *pslot[0] == p */

	if (p->avl_left != peer_avl_empty_rcu) {
		/* look for a node to insert instead of p */
		struct inet_peer *repl = lookup_rightempty(p, base);

		BUG_ON(rcu_deref_locked(*stackptr[-1], base) != repl);
		/* detach repl: its left child takes over repl's slot */
		**--stackptr = repl->avl_left;
		/* repl is removed, repl->daddr > x->daddr for any
		 * x in p->avl_left subtree.
		 * Put repl in the old place of p. */
		RCU_INIT_POINTER(*pslot[0], repl);
		repl->avl_left = p->avl_left;
		repl->avl_right = p->avl_right;
		repl->avl_height = p->avl_height;
		/* the recorded path went through p's left link; re-point it */
		BUG_ON(pslot[1] != &p->avl_left);
		pslot[1] = &repl->avl_left;	/* was &p->avl_left */
	} else {
		/* no left subtree: the right child simply replaces p */
		*pslot[0] = p->avl_right;
		--stackptr;
	}

	peer_avl_rebalance(stack, stackptr, base);
	base->total--;
	call_rcu(&p->rcu, inetpeer_free_rcu);
}
2010-11-30 23:12:23 +03:00
static struct inet_peer_base * family_to_base ( int family )
{
2011-06-08 17:35:34 +04:00
return family = = AF_INET ? & v4_peers : & v6_peers ;
2010-11-30 23:12:23 +03:00
}
2011-06-08 17:35:34 +04:00
/*
 * Perform garbage collection on all items stacked during a lookup.
 *
 * @base:     pool the stacked entries belong to
 * @stack:    path recorded by the lookup; reused (overwritten) when
 *            victims are unlinked
 * @stackptr: one past the last recorded slot of @stack
 *
 * An entry is collected when its refcnt is 0, it has been unused for at
 * least 'ttl' jiffies (measured from p->dtime) and its refcnt can be moved
 * from 0 to -1.  ttl is 0 once the pool holds inet_peer_threshold entries
 * or more; otherwise it starts at inet_peer_maxttl and is reduced as
 * base->total grows.
 *
 * Returns the number of entries unlinked.
 */
static int inet_peer_gc(struct inet_peer_base *base,
			struct inet_peer __rcu **stack[PEER_MAXDEPTH],
			struct inet_peer __rcu ***stackptr)
{
	struct inet_peer *p, *victims = NULL;
	__u32 age, ttl;
	int freed = 0;

	if (base->total < inet_peer_threshold)
		ttl = inet_peer_maxttl
				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
					base->total / inet_peer_threshold * HZ;
	else
		ttl = 0;	/* be aggressive */

	stackptr--;	/* last stack slot is peer_avl_empty */
	while (stackptr > stack) {
		stackptr--;
		p = rcu_deref_locked(**stackptr, base);

		if (atomic_read(&p->refcnt) != 0)
			continue;

		/*
		 * Read dtime only after refcnt has been seen as 0.
		 * NOTE(review): presumably pairs with the barrier in
		 * inet_putpeer() - confirm.
		 */
		smp_rmb();
		age = (__u32)jiffies - p->dtime;
		if (age < ttl)
			continue;

		/* claim the entry; skip it if the refcnt changed meanwhile */
		if (atomic_cmpxchg(&p->refcnt, 0, -1) != 0)
			continue;

		p->gc_next = victims;
		victims = p;
	}

	/* unlink after the walk: unlink_from_pool() overwrites @stack */
	while (victims) {
		p = victims;
		victims = p->gc_next;
		freed++;
		unlink_from_pool(p, base, stack);
	}

	return freed;
}
2011-07-22 08:25:58 +04:00
struct inet_peer * inet_getpeer ( const struct inetpeer_addr * daddr , int create )
2005-04-17 02:20:36 +04:00
{
2010-10-26 03:55:38 +04:00
struct inet_peer __rcu * * stack [ PEER_MAXDEPTH ] , * * * stackptr ;
2011-01-25 01:37:46 +03:00
struct inet_peer_base * base = family_to_base ( daddr - > family ) ;
2010-11-30 22:41:59 +03:00
struct inet_peer * p ;
2011-03-05 01:33:59 +03:00
unsigned int sequence ;
2011-06-08 17:35:34 +04:00
int invalidated , gccnt = 0 ;
2005-04-17 02:20:36 +04:00
2011-06-08 17:35:34 +04:00
/* Attempt a lockless lookup first.
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
* Because of a concurrent writer , we might not find an existing entry .
*/
2011-03-09 01:59:28 +03:00
rcu_read_lock ( ) ;
2011-03-05 01:33:59 +03:00
sequence = read_seqbegin ( & base - > lock ) ;
2011-06-08 17:35:34 +04:00
p = lookup_rcu ( daddr , base ) ;
2011-03-05 01:33:59 +03:00
invalidated = read_seqretry ( & base - > lock , sequence ) ;
2011-03-09 01:59:28 +03:00
rcu_read_unlock ( ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
2011-06-08 17:35:34 +04:00
if ( p )
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
return p ;
2005-04-17 02:20:36 +04:00
2011-03-05 01:33:59 +03:00
/* If no writer did a change during our lookup, we can return early. */
if ( ! create & & ! invalidated )
return NULL ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/* retry an exact lookup, taking the lock before.
* At least , nodes should be hot in our cache .
*/
2011-03-05 01:33:59 +03:00
write_seqlock_bh ( & base - > lock ) ;
2011-06-08 17:35:34 +04:00
relookup :
2010-11-30 23:08:53 +03:00
p = lookup ( daddr , stack , base ) ;
2005-04-17 02:20:36 +04:00
if ( p ! = peer_avl_empty ) {
2011-06-08 17:35:34 +04:00
atomic_inc ( & p - > refcnt ) ;
2011-03-05 01:33:59 +03:00
write_sequnlock_bh ( & base - > lock ) ;
2011-06-08 17:35:34 +04:00
return p ;
}
if ( ! gccnt ) {
gccnt = inet_peer_gc ( base , stack , stackptr ) ;
if ( gccnt & & create )
goto relookup ;
2005-04-17 02:20:36 +04:00
}
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
p = create ? kmem_cache_alloc ( peer_cachep , GFP_ATOMIC ) : NULL ;
if ( p ) {
2010-11-30 22:54:19 +03:00
p - > daddr = * daddr ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
atomic_set ( & p - > refcnt , 1 ) ;
atomic_set ( & p - > rid , 0 ) ;
2011-07-22 08:25:58 +04:00
atomic_set ( & p - > ip_id_count ,
( daddr - > family = = AF_INET ) ?
secure_ip_id ( daddr - > addr . a4 ) :
secure_ipv6_id ( daddr - > addr . a6 ) ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
p - > tcp_ts_stamp = 0 ;
2011-01-28 00:52:16 +03:00
p - > metrics [ RTAX_LOCK - 1 ] = INETPEER_METRICS_NEW ;
2011-02-05 02:55:25 +03:00
p - > rate_tokens = 0 ;
p - > rate_last = 0 ;
2011-02-10 02:36:47 +03:00
p - > pmtu_expires = 0 ;
2011-03-09 23:09:58 +03:00
p - > pmtu_orig = 0 ;
2011-02-10 02:36:47 +03:00
memset ( & p - > redirect_learned , 0 , sizeof ( p - > redirect_learned ) ) ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
/* Link the node. */
2010-11-30 22:41:59 +03:00
link_to_pool ( p , base ) ;
base - > total + + ;
inetpeer: RCU conversion
inetpeer currently uses an AVL tree protected by an rwlock.
It's possible to make most lookups use RCU
1) Add a struct rcu_head to struct inet_peer
2) add a lookup_rcu_bh() helper to perform lockless and opportunistic
lookup. This is a normal function, not a macro like lookup().
3) Add a limit to number of links followed by lookup_rcu_bh(). This is
needed in case we fall in a loop.
4) add an smp_wmb() in link_to_pool() right before node insert.
5) make unlink_from_pool() use atomic_cmpxchg() to make sure it can take
last reference to an inet_peer, since lockless readers could increase
refcount, even while we hold peers.lock.
6) Delay struct inet_peer freeing after rcu grace period so that
lookup_rcu_bh() cannot crash.
7) inet_getpeer() first attempts lockless lookup.
Note this lookup can fail even if target is in AVL tree, but a
concurrent writer can let tree in a non correct form.
If this attemps fails, lock is taken a regular lookup is performed
again.
8) convert peers.lock from rwlock to a spinlock
9) Remove SLAB_HWCACHE_ALIGN when peer_cachep is created, because
rcu_head adds 16 bytes on 64bit arches, doubling effective size (64 ->
128 bytes)
In a future patch, this is probably possible to revert this part, if rcu
field is put in an union to share space with rid, ip_id_count, tcp_ts &
tcp_ts_stamp. These fields being manipulated only with refcnt > 0.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-06-15 12:23:14 +04:00
}
2011-03-05 01:33:59 +03:00
write_sequnlock_bh ( & base - > lock ) ;
2005-04-17 02:20:36 +04:00
return p ;
}
2010-11-30 23:27:11 +03:00
EXPORT_SYMBOL_GPL ( inet_getpeer ) ;
2010-11-30 22:41:59 +03:00
2006-10-13 08:21:06 +04:00
void inet_putpeer ( struct inet_peer * p )
{
2011-06-08 17:35:34 +04:00
p - > dtime = ( __u32 ) jiffies ;
2011-07-11 06:49:52 +04:00
smp_mb__before_atomic_dec ( ) ;
2011-06-08 17:35:34 +04:00
atomic_dec ( & p - > refcnt ) ;
2006-10-13 08:21:06 +04:00
}
2010-11-30 23:27:11 +03:00
EXPORT_SYMBOL_GPL ( inet_putpeer ) ;
2011-02-05 02:55:25 +03:00
/*
* Check transmit rate limitation for given message .
* The rate information is held in the inet_peer entries now .
* This function is generic and could be used for other purposes
* too . It uses a Token bucket filter as suggested by Alexey Kuznetsov .
*
* Note that the same inet_peer fields are modified by functions in
* route . c too , but these work for packet destinations while xrlim_allow
* works for icmp destinations . This means the rate limiting information
* for one " ip object " is shared - and these ICMPs are twice limited :
* by source and by destination .
*
* RFC 1812 : 4.3 .2 .8 SHOULD be able to limit error message rate
* SHOULD allow setting of rate limits
*
* Shared between ICMPv4 and ICMPv6 .
*/
# define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow ( struct inet_peer * peer , int timeout )
{
unsigned long now , token ;
bool rc = false ;
if ( ! peer )
return true ;
token = peer - > rate_tokens ;
now = jiffies ;
token + = now - peer - > rate_last ;
peer - > rate_last = now ;
if ( token > XRLIM_BURST_FACTOR * timeout )
token = XRLIM_BURST_FACTOR * timeout ;
if ( token > = timeout ) {
token - = timeout ;
rc = true ;
}
peer - > rate_tokens = token ;
return rc ;
}
EXPORT_SYMBOL ( inet_peer_xrlim_allow ) ;