2005-04-17 02:20:36 +04:00
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */
# include <asm/uaccess.h>
# include <asm/system.h>
# include <linux/bitops.h>
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/sched.h>
# include <linux/string.h>
# include <linux/mm.h>
# include <linux/socket.h>
# include <linux/sockios.h>
# include <linux/in.h>
# include <linux/errno.h>
# include <linux/interrupt.h>
# include <linux/netdevice.h>
# include <linux/skbuff.h>
# include <linux/rtnetlink.h>
# include <linux/init.h>
# include <linux/rcupdate.h>
# include <linux/list.h>
# include <net/sock.h>
# include <net/pkt_sched.h>
/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */
void qdisc_lock_tree ( struct net_device * dev )
{
spin_lock_bh ( & dev - > queue_lock ) ;
}
void qdisc_unlock_tree ( struct net_device * dev )
{
spin_unlock_bh ( & dev - > queue_lock ) ;
}
2007-02-09 17:25:16 +03:00
/*
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive,
   if one is grabbed, another must be free.
 */
/* Kick device.
Note , that this procedure can be called by a watchdog timer , so that
we do not check dev - > tbusy flag here .
Returns : 0 - queue is empty .
2007-02-09 17:25:16 +03:00
> 0 - queue is not empty , but throttled .
2005-04-17 02:20:36 +04:00
< 0 - queue is not empty . Device is throttled , if dev - > tbusy ! = 0.
NOTE : Called under dev - > queue_lock with locally disabled BH .
*/
2006-06-20 10:57:59 +04:00
static inline int qdisc_restart ( struct net_device * dev )
2005-04-17 02:20:36 +04:00
{
struct Qdisc * q = dev - > qdisc ;
struct sk_buff * skb ;
/* Dequeue packet */
2006-06-22 13:57:17 +04:00
if ( ( ( skb = dev - > gso_skb ) ) | | ( ( skb = q - > dequeue ( q ) ) ) ) {
2005-04-17 02:20:36 +04:00
unsigned nolock = ( dev - > features & NETIF_F_LLTX ) ;
2006-06-22 13:57:17 +04:00
dev - > gso_skb = NULL ;
2005-04-17 02:20:36 +04:00
/*
* When the driver has LLTX set it does its own locking
* in start_xmit . No need to add additional overhead by
* locking again . These checks are worth it because
* even uncongested locks can be quite expensive .
* The driver can do trylock like here too , in case
* of lock congestion it should return - 1 and the packet
* will be requeued .
*/
if ( ! nolock ) {
2006-06-09 23:20:56 +04:00
if ( ! netif_tx_trylock ( dev ) ) {
2005-04-17 02:20:36 +04:00
collision :
/* So, someone grabbed the driver. */
2007-02-09 17:25:16 +03:00
2005-04-17 02:20:36 +04:00
/* It may be transient configuration error,
when hard_start_xmit ( ) recurses . We detect
it by checking xmit owner and drop the
packet when deadloop is detected .
*/
if ( dev - > xmit_lock_owner = = smp_processor_id ( ) ) {
kfree_skb ( skb ) ;
if ( net_ratelimit ( ) )
printk ( KERN_DEBUG " Dead loop on netdevice %s, fix it urgently! \n " , dev - > name ) ;
return - 1 ;
}
__get_cpu_var ( netdev_rx_stat ) . cpu_collision + + ;
goto requeue ;
}
}
2007-02-09 17:25:16 +03:00
2005-04-17 02:20:36 +04:00
{
/* And release queue */
spin_unlock ( & dev - > queue_lock ) ;
if ( ! netif_queue_stopped ( dev ) ) {
int ret ;
2006-06-22 13:57:17 +04:00
ret = dev_hard_start_xmit ( skb , dev ) ;
2007-02-09 17:25:16 +03:00
if ( ret = = NETDEV_TX_OK ) {
2005-04-17 02:20:36 +04:00
if ( ! nolock ) {
2006-06-09 23:20:56 +04:00
netif_tx_unlock ( dev ) ;
2005-04-17 02:20:36 +04:00
}
spin_lock ( & dev - > queue_lock ) ;
return - 1 ;
}
if ( ret = = NETDEV_TX_LOCKED & & nolock ) {
spin_lock ( & dev - > queue_lock ) ;
2007-02-09 17:25:16 +03:00
goto collision ;
2005-04-17 02:20:36 +04:00
}
}
/* NETDEV_TX_BUSY - we need to requeue */
/* Release the driver */
2007-02-09 17:25:16 +03:00
if ( ! nolock ) {
2006-06-09 23:20:56 +04:00
netif_tx_unlock ( dev ) ;
2007-02-09 17:25:16 +03:00
}
2005-04-17 02:20:36 +04:00
spin_lock ( & dev - > queue_lock ) ;
q = dev - > qdisc ;
}
/* Device kicked us out :(
This is possible in three cases :
0. driver is locked
1. fastroute is enabled
2. device cannot determine busy state
before start of transmission ( f . e . dialout )
3. device is buggy ( ppp )
*/
requeue :
2006-06-22 13:57:17 +04:00
if ( skb - > next )
dev - > gso_skb = skb ;
else
q - > ops - > requeue ( skb , q ) ;
2005-04-17 02:20:36 +04:00
netif_schedule ( dev ) ;
return 1 ;
}
2005-05-04 03:24:03 +04:00
BUG_ON ( ( int ) q - > q . qlen < 0 ) ;
2005-04-17 02:20:36 +04:00
return q - > q . qlen ;
}
2006-06-20 10:57:59 +04:00
void __qdisc_run ( struct net_device * dev )
{
2006-06-22 13:28:18 +04:00
if ( unlikely ( dev - > qdisc = = & noop_qdisc ) )
goto out ;
2006-06-20 10:57:59 +04:00
while ( qdisc_restart ( dev ) < 0 & & ! netif_queue_stopped ( dev ) )
/* NOTHING */ ;
2006-06-22 13:28:18 +04:00
out :
2006-06-20 10:57:59 +04:00
clear_bit ( __LINK_STATE_QDISC_RUNNING , & dev - > state ) ;
}
2005-04-17 02:20:36 +04:00
static void dev_watchdog ( unsigned long arg )
{
struct net_device * dev = ( struct net_device * ) arg ;
2006-06-09 23:20:56 +04:00
netif_tx_lock ( dev ) ;
2005-04-17 02:20:36 +04:00
if ( dev - > qdisc ! = & noop_qdisc ) {
if ( netif_device_present ( dev ) & &
netif_running ( dev ) & &
netif_carrier_ok ( dev ) ) {
if ( netif_queue_stopped ( dev ) & &
2006-05-17 02:02:12 +04:00
time_after ( jiffies , dev - > trans_start + dev - > watchdog_timeo ) ) {
printk ( KERN_INFO " NETDEV WATCHDOG: %s: transmit timed out \n " ,
dev - > name ) ;
2005-04-17 02:20:36 +04:00
dev - > tx_timeout ( dev ) ;
}
2007-02-06 04:59:51 +03:00
if ( ! mod_timer ( & dev - > watchdog_timer , round_jiffies ( jiffies + dev - > watchdog_timeo ) ) )
2005-04-17 02:20:36 +04:00
dev_hold ( dev ) ;
}
}
2006-06-09 23:20:56 +04:00
netif_tx_unlock ( dev ) ;
2005-04-17 02:20:36 +04:00
dev_put ( dev ) ;
}
static void dev_watchdog_init ( struct net_device * dev )
{
init_timer ( & dev - > watchdog_timer ) ;
dev - > watchdog_timer . data = ( unsigned long ) dev ;
dev - > watchdog_timer . function = dev_watchdog ;
}
void __netdev_watchdog_up ( struct net_device * dev )
{
if ( dev - > tx_timeout ) {
if ( dev - > watchdog_timeo < = 0 )
dev - > watchdog_timeo = 5 * HZ ;
if ( ! mod_timer ( & dev - > watchdog_timer , jiffies + dev - > watchdog_timeo ) )
dev_hold ( dev ) ;
}
}
/* File-local wrapper that simply forwards to __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}
static void dev_watchdog_down ( struct net_device * dev )
{
2006-06-09 23:20:56 +04:00
netif_tx_lock_bh ( dev ) ;
2005-04-17 02:20:36 +04:00
if ( del_timer ( & dev - > watchdog_timer ) )
2006-03-21 09:32:28 +03:00
dev_put ( dev ) ;
2006-06-09 23:20:56 +04:00
netif_tx_unlock_bh ( dev ) ;
2005-04-17 02:20:36 +04:00
}
2005-08-12 02:32:53 +04:00
void netif_carrier_on ( struct net_device * dev )
{
if ( test_and_clear_bit ( __LINK_STATE_NOCARRIER , & dev - > state ) )
linkwatch_fire_event ( dev ) ;
if ( netif_running ( dev ) )
__netdev_watchdog_up ( dev ) ;
}
void netif_carrier_off ( struct net_device * dev )
{
if ( ! test_and_set_bit ( __LINK_STATE_NOCARRIER , & dev - > state ) )
linkwatch_fire_event ( dev ) ;
}
2005-04-17 02:20:36 +04:00
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */
2005-06-19 09:59:08 +04:00
static int noop_enqueue ( struct sk_buff * skb , struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
kfree_skb ( skb ) ;
return NET_XMIT_CN ;
}
2005-06-19 09:59:08 +04:00
static struct sk_buff * noop_dequeue ( struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
return NULL ;
}
2005-06-19 09:59:08 +04:00
static int noop_requeue ( struct sk_buff * skb , struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
if ( net_ratelimit ( ) )
2005-06-19 09:59:08 +04:00
printk ( KERN_DEBUG " %s deferred output. It is buggy. \n " ,
skb - > dev - > name ) ;
2005-04-17 02:20:36 +04:00
kfree_skb ( skb ) ;
return NET_XMIT_CN ;
}
/*
 * Operations table of the "noop" scheduler: no private data, and all
 * three queue operations are the packet-dropping noop_* helpers.
 */
struct Qdisc_ops noop_qdisc_ops = {
	.id		=	" noop ",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};
struct Qdisc noop_qdisc = {
. enqueue = noop_enqueue ,
. dequeue = noop_dequeue ,
. flags = TCQ_F_BUILTIN ,
2007-02-09 17:25:16 +03:00
. ops = & noop_qdisc_ops ,
2005-04-17 02:20:36 +04:00
. list = LIST_HEAD_INIT ( noop_qdisc . list ) ,
} ;
/*
 * Operations table of the "noqueue" scheduler.  It reuses the noop_*
 * helpers for all three queue operations and has no private data.
 */
static struct Qdisc_ops noqueue_qdisc_ops = {
	.id		=	" noqueue ",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};
/*
 * The statically allocated "noqueue" qdisc instance; dev_activate()
 * attaches it to devices whose tx_queue_len is zero.
 *
 * Note that ->enqueue is deliberately NULL here, unlike in the ops table.
 * NOTE(review): presumably the transmit path tests for a NULL enqueue to
 * bypass queueing -- confirm against dev_queue_xmit().
 */
static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};
/*
 * Maps a packet priority (skb->priority & TC_PRIO_MAX) to one of the
 * pfifo_fast bands.  Band 0 is dequeued first, band 2 last.
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

/* Number of FIFO bands kept in the private area of a pfifo_fast qdisc. */
#define PFIFO_FAST_BANDS 3
2005-06-19 09:58:35 +04:00
static inline struct sk_buff_head * prio2list ( struct sk_buff * skb ,
struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
struct sk_buff_head * list = qdisc_priv ( qdisc ) ;
2005-06-19 09:58:35 +04:00
return list + prio2band [ skb - > priority & TC_PRIO_MAX ] ;
}
2005-04-17 02:20:36 +04:00
2005-06-19 09:58:53 +04:00
static int pfifo_fast_enqueue ( struct sk_buff * skb , struct Qdisc * qdisc )
2005-06-19 09:58:35 +04:00
{
struct sk_buff_head * list = prio2list ( skb , qdisc ) ;
2005-04-17 02:20:36 +04:00
2005-06-19 09:58:15 +04:00
if ( skb_queue_len ( list ) < qdisc - > dev - > tx_queue_len ) {
2005-04-17 02:20:36 +04:00
qdisc - > q . qlen + + ;
2005-06-19 09:58:15 +04:00
return __qdisc_enqueue_tail ( skb , qdisc , list ) ;
2005-04-17 02:20:36 +04:00
}
2005-06-19 09:58:15 +04:00
return qdisc_drop ( skb , qdisc ) ;
2005-04-17 02:20:36 +04:00
}
2005-06-19 09:58:53 +04:00
static struct sk_buff * pfifo_fast_dequeue ( struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
int prio ;
struct sk_buff_head * list = qdisc_priv ( qdisc ) ;
2005-07-19 00:30:53 +04:00
for ( prio = 0 ; prio < PFIFO_FAST_BANDS ; prio + + ) {
if ( ! skb_queue_empty ( list + prio ) ) {
2005-04-17 02:20:36 +04:00
qdisc - > q . qlen - - ;
2005-07-19 00:30:53 +04:00
return __qdisc_dequeue_head ( qdisc , list + prio ) ;
2005-04-17 02:20:36 +04:00
}
}
2005-06-19 09:58:53 +04:00
2005-04-17 02:20:36 +04:00
return NULL ;
}
2005-06-19 09:58:53 +04:00
static int pfifo_fast_requeue ( struct sk_buff * skb , struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
qdisc - > q . qlen + + ;
2005-06-19 09:58:35 +04:00
return __qdisc_requeue ( skb , qdisc , prio2list ( skb , qdisc ) ) ;
2005-04-17 02:20:36 +04:00
}
2005-06-19 09:58:53 +04:00
static void pfifo_fast_reset ( struct Qdisc * qdisc )
2005-04-17 02:20:36 +04:00
{
int prio ;
struct sk_buff_head * list = qdisc_priv ( qdisc ) ;
2005-06-19 09:58:53 +04:00
for ( prio = 0 ; prio < PFIFO_FAST_BANDS ; prio + + )
2005-06-19 09:58:15 +04:00
__qdisc_reset_queue ( qdisc , list + prio ) ;
qdisc - > qstats . backlog = 0 ;
2005-04-17 02:20:36 +04:00
qdisc - > q . qlen = 0 ;
}
static int pfifo_fast_dump ( struct Qdisc * qdisc , struct sk_buff * skb )
{
2005-06-19 09:58:53 +04:00
struct tc_prio_qopt opt = { . bands = PFIFO_FAST_BANDS } ;
2005-04-17 02:20:36 +04:00
memcpy ( & opt . priomap , prio2band , TC_PRIO_MAX + 1 ) ;
RTA_PUT ( skb , TCA_OPTIONS , sizeof ( opt ) , & opt ) ;
return skb - > len ;
rtattr_failure :
return - 1 ;
}
static int pfifo_fast_init ( struct Qdisc * qdisc , struct rtattr * opt )
{
2005-06-19 09:58:53 +04:00
int prio ;
2005-04-17 02:20:36 +04:00
struct sk_buff_head * list = qdisc_priv ( qdisc ) ;
2005-06-19 09:58:53 +04:00
for ( prio = 0 ; prio < PFIFO_FAST_BANDS ; prio + + )
skb_queue_head_init ( list + prio ) ;
2005-04-17 02:20:36 +04:00
return 0 ;
}
static struct Qdisc_ops pfifo_fast_ops = {
. id = " pfifo_fast " ,
2005-06-19 09:58:53 +04:00
. priv_size = PFIFO_FAST_BANDS * sizeof ( struct sk_buff_head ) ,
2005-04-17 02:20:36 +04:00
. enqueue = pfifo_fast_enqueue ,
. dequeue = pfifo_fast_dequeue ,
. requeue = pfifo_fast_requeue ,
. init = pfifo_fast_init ,
. reset = pfifo_fast_reset ,
. dump = pfifo_fast_dump ,
. owner = THIS_MODULE ,
} ;
2005-07-06 01:15:09 +04:00
struct Qdisc * qdisc_alloc ( struct net_device * dev , struct Qdisc_ops * ops )
2005-04-17 02:20:36 +04:00
{
void * p ;
struct Qdisc * sch ;
2005-07-06 01:15:09 +04:00
unsigned int size ;
int err = - ENOBUFS ;
2005-04-17 02:20:36 +04:00
/* ensure that the Qdisc and the private data are 32-byte aligned */
2005-07-06 01:15:09 +04:00
size = QDISC_ALIGN ( sizeof ( * sch ) ) ;
size + = ops - > priv_size + ( QDISC_ALIGNTO - 1 ) ;
2005-04-17 02:20:36 +04:00
2006-07-22 01:51:30 +04:00
p = kzalloc ( size , GFP_KERNEL ) ;
2005-04-17 02:20:36 +04:00
if ( ! p )
2005-07-06 01:15:09 +04:00
goto errout ;
sch = ( struct Qdisc * ) QDISC_ALIGN ( ( unsigned long ) p ) ;
sch - > padded = ( char * ) sch - ( char * ) p ;
2005-04-17 02:20:36 +04:00
INIT_LIST_HEAD ( & sch - > list ) ;
skb_queue_head_init ( & sch - > q ) ;
sch - > ops = ops ;
sch - > enqueue = ops - > enqueue ;
sch - > dequeue = ops - > dequeue ;
sch - > dev = dev ;
dev_hold ( dev ) ;
sch - > stats_lock = & dev - > queue_lock ;
atomic_set ( & sch - > refcnt , 1 ) ;
2005-07-06 01:15:09 +04:00
return sch ;
errout :
return ERR_PTR ( - err ) ;
}
2006-11-30 04:35:18 +03:00
struct Qdisc * qdisc_create_dflt ( struct net_device * dev , struct Qdisc_ops * ops ,
unsigned int parentid )
2005-07-06 01:15:09 +04:00
{
struct Qdisc * sch ;
2007-02-09 17:25:16 +03:00
2005-07-06 01:15:09 +04:00
sch = qdisc_alloc ( dev , ops ) ;
if ( IS_ERR ( sch ) )
goto errout ;
2006-11-30 04:35:18 +03:00
sch - > parent = parentid ;
2005-07-06 01:15:09 +04:00
2005-04-17 02:20:36 +04:00
if ( ! ops - > init | | ops - > init ( sch , NULL ) = = 0 )
return sch ;
2005-08-23 21:12:44 +04:00
qdisc_destroy ( sch ) ;
2005-07-06 01:15:09 +04:00
errout :
2005-04-17 02:20:36 +04:00
return NULL ;
}
/* Under dev->queue_lock and BH! */
void qdisc_reset ( struct Qdisc * qdisc )
{
struct Qdisc_ops * ops = qdisc - > ops ;
if ( ops - > reset )
ops - > reset ( qdisc ) ;
}
2007-02-09 17:25:16 +03:00
/* this is the rcu callback function to clean up a qdisc when there
2005-04-17 02:20:36 +04:00
* are no further references to it */
static void __qdisc_destroy ( struct rcu_head * head )
{
struct Qdisc * qdisc = container_of ( head , struct Qdisc , q_rcu ) ;
kfree ( ( char * ) qdisc - qdisc - > padded ) ;
}
/* Under dev->queue_lock and BH! */
void qdisc_destroy ( struct Qdisc * qdisc )
{
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-09-28 03:45:45 +04:00
struct Qdisc_ops * ops = qdisc - > ops ;
2005-04-17 02:20:36 +04:00
if ( qdisc - > flags & TCQ_F_BUILTIN | |
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-09-28 03:45:45 +04:00
! atomic_dec_and_test ( & qdisc - > refcnt ) )
2005-04-17 02:20:36 +04:00
return ;
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-09-28 03:45:45 +04:00
list_del ( & qdisc - > list ) ;
# ifdef CONFIG_NET_ESTIMATOR
gen_kill_estimator ( & qdisc - > bstats , & qdisc - > rate_est ) ;
# endif
if ( ops - > reset )
ops - > reset ( qdisc ) ;
if ( ops - > destroy )
ops - > destroy ( qdisc ) ;
2005-04-17 02:20:36 +04:00
[NET_SCHED]: Fix fallout from dev->qdisc RCU change
The move of qdisc destruction to a rcu callback broke locking in the
entire qdisc layer by invalidating previously valid assumptions about
the context in which changes to the qdisc tree occur.
The two assumptions were:
- since changes only happen in process context, read_lock doesn't need
bottem half protection. Now invalid since destruction of inner qdiscs,
classifiers, actions and estimators happens in the RCU callback unless
they're manually deleted, resulting in dead-locks when read_lock in
process context is interrupted by write_lock_bh in bottem half context.
- since changes only happen under the RTNL, no additional locking is
necessary for data not used during packet processing (f.e. u32_list).
Again, since destruction now happens in the RCU callback, this assumption
is not valid anymore, causing races while using this data, which can
result in corruption or use-after-free.
Instead of "fixing" this by disabling bottem halfs everywhere and adding
new locks/refcounting, this patch makes these assumptions valid again by
moving destruction back to process context. Since only the dev->qdisc
pointer is protected by RCU, but ->enqueue and the qdisc tree are still
protected by dev->qdisc_lock, destruction of the tree can be performed
immediately and only the final free needs to happen in the rcu callback
to make sure dev_queue_xmit doesn't access already freed memory.
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
2006-09-28 03:45:45 +04:00
module_put ( ops - > owner ) ;
dev_put ( qdisc - > dev ) ;
2005-04-17 02:20:36 +04:00
call_rcu ( & qdisc - > q_rcu , __qdisc_destroy ) ;
}
void dev_activate ( struct net_device * dev )
{
/* No queueing discipline is attached to device;
create default one i . e . pfifo_fast for devices ,
which need queueing and noqueue_qdisc for
virtual interfaces
*/
if ( dev - > qdisc_sleeping = = & noop_qdisc ) {
struct Qdisc * qdisc ;
if ( dev - > tx_queue_len ) {
2006-11-30 04:35:18 +03:00
qdisc = qdisc_create_dflt ( dev , & pfifo_fast_ops ,
TC_H_ROOT ) ;
2005-04-17 02:20:36 +04:00
if ( qdisc = = NULL ) {
printk ( KERN_INFO " %s: activation failed \n " , dev - > name ) ;
return ;
}
list_add_tail ( & qdisc - > list , & dev - > qdisc_list ) ;
} else {
qdisc = & noqueue_qdisc ;
}
dev - > qdisc_sleeping = qdisc ;
}
2005-05-04 03:18:52 +04:00
if ( ! netif_carrier_ok ( dev ) )
/* Delay activation until next carrier-on event */
return ;
2005-04-17 02:20:36 +04:00
spin_lock_bh ( & dev - > queue_lock ) ;
rcu_assign_pointer ( dev - > qdisc , dev - > qdisc_sleeping ) ;
if ( dev - > qdisc ! = & noqueue_qdisc ) {
dev - > trans_start = jiffies ;
dev_watchdog_up ( dev ) ;
}
spin_unlock_bh ( & dev - > queue_lock ) ;
}
void dev_deactivate ( struct net_device * dev )
{
struct Qdisc * qdisc ;
spin_lock_bh ( & dev - > queue_lock ) ;
qdisc = dev - > qdisc ;
dev - > qdisc = & noop_qdisc ;
qdisc_reset ( qdisc ) ;
spin_unlock_bh ( & dev - > queue_lock ) ;
dev_watchdog_down ( dev ) ;
2006-06-22 13:28:18 +04:00
/* Wait for outstanding dev_queue_xmit calls. */
synchronize_rcu ( ) ;
2005-04-17 02:20:36 +04:00
2006-06-22 13:28:18 +04:00
/* Wait for outstanding qdisc_run calls. */
while ( test_bit ( __LINK_STATE_QDISC_RUNNING , & dev - > state ) )
yield ( ) ;
2006-06-22 13:57:17 +04:00
if ( dev - > gso_skb ) {
kfree_skb ( dev - > gso_skb ) ;
dev - > gso_skb = NULL ;
}
2005-04-17 02:20:36 +04:00
}
void dev_init_scheduler ( struct net_device * dev )
{
qdisc_lock_tree ( dev ) ;
dev - > qdisc = & noop_qdisc ;
dev - > qdisc_sleeping = & noop_qdisc ;
INIT_LIST_HEAD ( & dev - > qdisc_list ) ;
qdisc_unlock_tree ( dev ) ;
dev_watchdog_init ( dev ) ;
}
void dev_shutdown ( struct net_device * dev )
{
struct Qdisc * qdisc ;
qdisc_lock_tree ( dev ) ;
qdisc = dev - > qdisc_sleeping ;
dev - > qdisc = & noop_qdisc ;
dev - > qdisc_sleeping = & noop_qdisc ;
qdisc_destroy ( qdisc ) ;
# if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
2007-02-09 17:25:16 +03:00
if ( ( qdisc = dev - > qdisc_ingress ) ! = NULL ) {
2005-04-17 02:20:36 +04:00
dev - > qdisc_ingress = NULL ;
qdisc_destroy ( qdisc ) ;
2007-02-09 17:25:16 +03:00
}
2005-04-17 02:20:36 +04:00
# endif
BUG_TRAP ( ! timer_pending ( & dev - > watchdog_timer ) ) ;
qdisc_unlock_tree ( dev ) ;
}
2005-08-12 02:32:53 +04:00
/* Carrier state helpers. */
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);

/* Generic qdisc objects and routines. */
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);