/*
 * net/core/dst.c	Protocol independent destination cache.
 *
 * Authors:		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/net_namespace.h>

#include <net/dst.h>

/*
 * Theory of operations:
 * 1) We use a list, protected by a spinlock, to add
 *    new entries from both BH and non-BH context.
 * 2) In order to keep the spinlock held only briefly,
 *    we use a second list where the long lived entries are stored;
 *    those are handled by the garbage collect task fired by a workqueue.
 * 3) This list is guarded by a mutex,
 *    so that the gc_task and dst_dev_event() can be synchronized.
 */
#if RT_CACHE_DEBUG >= 2
static atomic_t			dst_total = ATOMIC_INIT(0);
#endif

/*
 * We want to keep lock & list close together
 * to dirty as few cache lines as possible in __dst_free().
 * As this is not a very strong hint, we don't force an alignment on SMP.
 */
static struct {
	spinlock_t		lock;
	struct dst_entry	*list;
	unsigned long		timer_inc;
	unsigned long		timer_expires;
} dst_garbage = {
	.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
	.timer_inc = DST_GC_MAX,
};
static void dst_gc_task(struct work_struct *work);
static void ___dst_free(struct dst_entry *dst);

static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);

static DEFINE_MUTEX(dst_gc_mutex);
/*
 * long lived entries are maintained in this list, guarded by dst_gc_mutex
 */
static struct dst_entry		*dst_busy_list;
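
/*
 * dst_gc_task() walks dst_busy_list under dst_gc_mutex, destroying entries
 * whose refcount has dropped to zero and keeping the still-referenced ones
 * on the list.  Entries queued on dst_garbage.list in the meantime are
 * spliced in and processed as well.  If entries remain, the work is rearmed
 * with a delay that grows while little is being freed and drops back to
 * DST_GC_MIN as soon as destruction makes progress again.
 */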
static void dst_gc_task(struct work_struct *work)
{
	int delayed = 0;
	int work_performed = 0;
	unsigned long expires = ~0L;
	struct dst_entry *dst, *next, head;
	struct dst_entry *last = &head;
#if RT_CACHE_DEBUG >= 2
	ktime_t time_start = ktime_get();
	struct timespec elapsed;
#endif

	mutex_lock(&dst_gc_mutex);
	next = dst_busy_list;

loop:
	while ((dst = next) != NULL) {
		next = dst->next;
		prefetch(&next->next);
		if (likely(atomic_read(&dst->__refcnt))) {
			last->next = dst;
			last = dst;
			delayed++;
			continue;
		}
		work_performed++;

		dst = dst_destroy(dst);
		if (dst) {
			/* NOHASH and still referenced. Unless it is already
			 * on gc list, invalidate it and add to gc list.
			 *
			 * Note: this is temporary. Actually, NOHASH dst's
			 * must be obsoleted when parent is obsoleted.
			 * But we do not have state "obsoleted, but
			 * referenced by parent", so it is right.
			 */
			if (dst->obsolete > 1)
				continue;

			___dst_free(dst);
			dst->next = next;
			next = dst;
		}
	}

	spin_lock_bh(&dst_garbage.lock);
	next = dst_garbage.list;
	if (next) {
		dst_garbage.list = NULL;
		spin_unlock_bh(&dst_garbage.lock);
		goto loop;
	}
	last->next = NULL;
	dst_busy_list = head.next;
	if (!dst_busy_list)
		dst_garbage.timer_inc = DST_GC_MAX;
	else {
		/*
		 * if we freed less than 1/10 of delayed entries,
		 * we can sleep longer.
		 */
		if (work_performed <= delayed/10) {
			dst_garbage.timer_expires += dst_garbage.timer_inc;
			if (dst_garbage.timer_expires > DST_GC_MAX)
				dst_garbage.timer_expires = DST_GC_MAX;
			dst_garbage.timer_inc += DST_GC_INC;
		} else {
			dst_garbage.timer_inc = DST_GC_INC;
			dst_garbage.timer_expires = DST_GC_MIN;
		}
		expires = dst_garbage.timer_expires;
		/*
		 * if the next desired timer is more than 4 seconds in the
		 * future then round the timer to whole seconds
		 */
		if (expires > 4*HZ)
			expires = round_jiffies_relative(expires);
		schedule_delayed_work(&dst_gc_work, expires);
	}

	spin_unlock_bh(&dst_garbage.lock);
	mutex_unlock(&dst_gc_mutex);
#if RT_CACHE_DEBUG >= 2
	elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start));
	printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d"
		" expires: %lu elapsed: %lu us\n",
		atomic_read(&dst_total), delayed, work_performed,
		expires,
		elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC);
#endif
}
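
/*
 * Default (and dead-route) input/output handler: any packet sent through
 * such an entry is simply dropped.
 */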
static int dst_discard(struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}
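
/*
 * dst_alloc() returns a zeroed entry from the protocol's kmem cache.  If
 * the protocol registered a gc() hook and its entry count exceeds
 * gc_thresh, gc() gets a chance to reclaim entries first; a non-zero
 * return from gc() fails the allocation.  input/output default to
 * dst_discard until the caller installs real handlers.
 */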
void *dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *dst;

	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
		if (ops->gc())
			return NULL;
	}
	dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
	if (!dst)
		return NULL;
	atomic_set(&dst->__refcnt, 0);
	dst->ops = ops;
	dst->lastuse = jiffies;
	dst->path = dst;
	dst->input = dst->output = dst_discard;
#if RT_CACHE_DEBUG >= 2
	atomic_inc(&dst_total);
#endif
	atomic_inc(&ops->entries);
	return dst;
}
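
/*
 * ___dst_free() marks an entry obsolete.  If its device is gone or not up,
 * input/output are redirected to dst_discard so packets still in flight
 * are dropped instead of being sent through a dead device.
 */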
static void ___dst_free(struct dst_entry *dst)
{
	/* The first case (dev==NULL) is required when a
	   protocol module is unloaded.
	 */
	if (dst->dev == NULL || !(dst->dev->flags & IFF_UP)) {
		dst->input = dst->output = dst_discard;
	}
	dst->obsolete = 2;
}
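
/*
 * __dst_free() queues an entry on dst_garbage.list for the garbage
 * collector.  May be called from BH or process context; if the gc work is
 * currently on its slow schedule, it is pulled in to run again soon.
 */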
void __dst_free(struct dst_entry *dst)
{
	spin_lock_bh(&dst_garbage.lock);
	___dst_free(dst);
	dst->next = dst_garbage.list;
	dst_garbage.list = dst;
	if (dst_garbage.timer_inc > DST_GC_INC) {
		dst_garbage.timer_inc = DST_GC_INC;
		dst_garbage.timer_expires = DST_GC_MIN;
		schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
	}
	spin_unlock_bh(&dst_garbage.lock);
}
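
/*
 * dst_destroy() drops everything attached to an entry (hh cache, neighbour,
 * device, protocol private state) and frees it.  If the entry has a NOHASH
 * child whose refcount also drops to zero, the child is destroyed in the
 * same pass; a still-referenced NOHASH child is returned so the caller can
 * queue it for later freeing.
 */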
struct dst_entry *dst_destroy(struct dst_entry *dst)
{
	struct dst_entry *child;
	struct neighbour *neigh;
	struct hh_cache *hh;

	smp_rmb();

again:
	neigh = dst->neighbour;
	hh = dst->hh;
	child = dst->child;

	dst->hh = NULL;
	if (hh && atomic_dec_and_test(&hh->hh_refcnt))
		kfree(hh);

	if (neigh) {
		dst->neighbour = NULL;
		neigh_release(neigh);
	}

	atomic_dec(&dst->ops->entries);

	if (dst->ops->destroy)
		dst->ops->destroy(dst);
	if (dst->dev)
		dev_put(dst->dev);
#if RT_CACHE_DEBUG >= 2
	atomic_dec(&dst_total);
#endif
	kmem_cache_free(dst->ops->kmem_cachep, dst);

	dst = child;
	if (dst) {
		int nohash = dst->flags & DST_NOHASH;

		if (atomic_dec_and_test(&dst->__refcnt)) {
			/* We were real parent of this dst, so kill child. */
			if (nohash)
				goto again;
		} else {
			/* Child is still referenced, return it for freeing. */
			if (nohash)
				return dst;
			/* Child is still in its hash table */
		}
	}
	return NULL;
}

/* Dirty hack. We did it in 2.2 (in __dst_free),
 * we have _very_ good reasons not to repeat
 * this mistake in 2.3, but we have no choice
 * now. _It_ _is_ _explicit_ _deliberate_
 * _race_ _condition_.
 *
 * Commented and originally written by Alexey.
 */
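/*
 * dst_ifdown() detaches an entry from a device that is going away.  On a
 * plain NETDEV_DOWN the handlers are pointed at dst_discard; on unregister
 * the entry (and its neighbour, if bound to the same device) is moved over
 * to the init_net loopback device so the reference on the dying device can
 * be dropped.
 */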
static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			      int unregister)
{
	if (dst->ops->ifdown)
		dst->ops->ifdown(dst, dev, unregister);

	if (dev != dst->dev)
		return;

	if (!unregister) {
		dst->input = dst->output = dst_discard;
	} else {
		dst->dev = init_net.loopback_dev;
		dev_hold(dst->dev);
		dev_put(dev);
		if (dst->neighbour && dst->neighbour->dev == dev) {
			dst->neighbour->dev = init_net.loopback_dev;
			dev_put(dev);
			dev_hold(dst->neighbour->dev);
		}
	}
}
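
/*
 * Netdevice notifier: when a device goes down or is unregistered, walk
 * dst_busy_list under dst_gc_mutex, splice in whatever is pending on
 * dst_garbage.list, and let dst_ifdown() detach every entry from the
 * device.
 */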
static int dst_dev_event(struct notifier_block *this, unsigned long event,
			 void *ptr)
{
	struct net_device *dev = ptr;
	struct dst_entry *dst, *last = NULL;

	if (dev->nd_net != &init_net)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UNREGISTER:
	case NETDEV_DOWN:
		mutex_lock(&dst_gc_mutex);
		for (dst = dst_busy_list; dst; dst = dst->next) {
			last = dst;
			dst_ifdown(dst, dev, event != NETDEV_DOWN);
		}

		spin_lock_bh(&dst_garbage.lock);
		dst = dst_garbage.list;
		dst_garbage.list = NULL;
		spin_unlock_bh(&dst_garbage.lock);

		if (last)
			last->next = dst;
		else
			dst_busy_list = dst;
		for (; dst; dst = dst->next) {
			dst_ifdown(dst, dev, event != NETDEV_DOWN);
		}
		mutex_unlock(&dst_gc_mutex);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block dst_dev_notifier = {
	.notifier_call	= dst_dev_event,
};

void __init dst_init(void)
{
	register_netdevice_notifier(&dst_dev_notifier);
}

EXPORT_SYMBOL(__dst_free);
EXPORT_SYMBOL(dst_alloc);
EXPORT_SYMBOL(dst_destroy);