2005-04-17 02:20:36 +04:00
/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */
# include <linux/module.h>
# include <linux/types.h>
# include <linux/kernel.h>
# include <linux/jiffies.h>
# include <linux/string.h>
# include <linux/in.h>
# include <linux/errno.h>
# include <linux/init.h>
# include <linux/ipv6.h>
# include <linux/skbuff.h>
2007-10-01 04:51:33 +04:00
# include <linux/jhash.h>
2007-07-03 09:49:07 +04:00
# include <net/ip.h>
# include <net/netlink.h>
2005-04-17 02:20:36 +04:00
# include <net/pkt_sched.h>
/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM '90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.

	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.

	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that this scheduler
	should be used as a leaf of CBQ or P3, which put interactive traffic
	into a higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits maximal queue length to 128;
	maximal mtu to 2^15-1; number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not in flight.  */
/* Number of flow slots. The value SFQ_DEPTH itself is also stored in ht[]
 * entries and in q->tail as the "no slot" marker, so real slot indices
 * are 0 .. SFQ_DEPTH - 1. */
# define SFQ_DEPTH 128
/* Number of hash buckets. sfq_fold_hash() reduces the hash with
 * "& (SFQ_HASH_DIVISOR - 1)", which only reaches every bucket when this
 * value is a power of two. */
# define SFQ_HASH_DIVISOR 1024
/* This type should contain at least SFQ_DEPTH*2 values */
/* (it indexes dep[], which has SFQ_DEPTH * 2 entries) */
typedef unsigned char sfq_index ;
/*
 * One node of a doubly linked list threaded through sfq_sched_data.dep[].
 * The links are indices into dep[], not pointers.
 */
struct sfq_head {
	sfq_index	next;	/* index of the following node in dep[] */
	sfq_index	prev;	/* index of the preceding node in dep[] */
};
struct sfq_sched_data
{
/* Parameters */
int perturb_period ;
unsigned quantum ; /* Allotment per round: MUST BE >= MTU */
int limit ;
/* Variables */
struct timer_list perturb_timer ;
2007-10-01 04:51:33 +04:00
u32 perturbation ;
2005-04-17 02:20:36 +04:00
sfq_index tail ; /* Index of current slot in round */
sfq_index max_depth ; /* Maximal depth */
sfq_index ht [ SFQ_HASH_DIVISOR ] ; /* Hash table */
sfq_index next [ SFQ_DEPTH ] ; /* Active slots link */
short allot [ SFQ_DEPTH ] ; /* Current allotment per slot */
unsigned short hash [ SFQ_DEPTH ] ; /* Hash value indexed by slots */
struct sk_buff_head qs [ SFQ_DEPTH ] ; /* Slot queue */
struct sfq_head dep [ SFQ_DEPTH * 2 ] ; /* Linked list of slots, indexed by depth */
} ;
/*
 * Mix two 32-bit flow keys with the current perturbation value and reduce
 * the result to a hash-table bucket index in [0, SFQ_HASH_DIVISOR).
 */
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	u32 mixed = jhash_2words(h, h1, q->perturbation);

	return mixed & (SFQ_HASH_DIVISOR - 1);
}
static unsigned sfq_hash ( struct sfq_sched_data * q , struct sk_buff * skb )
{
u32 h , h2 ;
switch ( skb - > protocol ) {
case __constant_htons ( ETH_P_IP ) :
{
2007-04-21 09:47:35 +04:00
const struct iphdr * iph = ip_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
h = iph - > daddr ;
h2 = iph - > saddr ^ iph - > protocol ;
if ( ! ( iph - > frag_off & htons ( IP_MF | IP_OFFSET ) ) & &
( iph - > protocol = = IPPROTO_TCP | |
iph - > protocol = = IPPROTO_UDP | |
2007-02-08 02:07:43 +03:00
iph - > protocol = = IPPROTO_UDPLITE | |
2006-01-18 00:01:06 +03:00
iph - > protocol = = IPPROTO_SCTP | |
iph - > protocol = = IPPROTO_DCCP | |
2005-04-17 02:20:36 +04:00
iph - > protocol = = IPPROTO_ESP ) )
h2 ^ = * ( ( ( u32 * ) iph ) + iph - > ihl ) ;
break ;
}
case __constant_htons ( ETH_P_IPV6 ) :
{
2007-04-26 04:54:47 +04:00
struct ipv6hdr * iph = ipv6_hdr ( skb ) ;
2005-04-17 02:20:36 +04:00
h = iph - > daddr . s6_addr32 [ 3 ] ;
h2 = iph - > saddr . s6_addr32 [ 3 ] ^ iph - > nexthdr ;
if ( iph - > nexthdr = = IPPROTO_TCP | |
iph - > nexthdr = = IPPROTO_UDP | |
2007-02-08 02:07:43 +03:00
iph - > nexthdr = = IPPROTO_UDPLITE | |
2006-01-18 00:01:06 +03:00
iph - > nexthdr = = IPPROTO_SCTP | |
iph - > nexthdr = = IPPROTO_DCCP | |
2005-04-17 02:20:36 +04:00
iph - > nexthdr = = IPPROTO_ESP )
h2 ^ = * ( u32 * ) & iph [ 1 ] ;
break ;
}
default :
h = ( u32 ) ( unsigned long ) skb - > dst ^ skb - > protocol ;
h2 = ( u32 ) ( unsigned long ) skb - > sk ;
}
return sfq_fold_hash ( q , h , h2 ) ;
}
/*
 * Insert slot x at the front of the depth list that matches its current
 * queue length (list head is dep[qlen + SFQ_DEPTH]).
 * The slot must not be on any depth list when this is called.
 */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	int depth_idx = q->qs[x].qlen + SFQ_DEPTH;
	sfq_index head = depth_idx;
	sfq_index first = q->dep[depth_idx].next;

	q->dep[x].next = first;
	q->dep[x].prev = head;
	q->dep[first].prev = x;
	q->dep[head].next = x;
}
/*
 * Re-file slot x after one packet has been removed from its queue.
 * The caller has already taken the packet off qs[x], so qs[x].qlen is the
 * new length and qlen + 1 is the depth list the slot is still linked on.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index after = q->dep[x].next;
	sfq_index before = q->dep[x].prev;

	/* take x off its old depth list */
	q->dep[before].next = after;
	q->dep[after].prev = before;

	/*
	 * Equal neighbours mean x was the only slot on that list (both
	 * links pointed at the list head). If that list was the deepest
	 * one, the maximum depth shrinks by one.
	 */
	if (after == before && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}
/*
 * Re-file slot x after one packet has been added to its queue.
 * The caller has already put the packet on qs[x], so qs[x].qlen is the
 * new length; max_depth is raised if this slot is now the deepest.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index after = q->dep[x].next;
	sfq_index before = q->dep[x].prev;
	int depth;

	/* take x off its old depth list */
	q->dep[before].next = after;
	q->dep[after].prev = before;

	depth = q->qs[x].qlen;
	if (depth > q->max_depth)
		q->max_depth = depth;

	sfq_link(q, x);
}
static unsigned int sfq_drop ( struct Qdisc * sch )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
sfq_index d = q - > max_depth ;
struct sk_buff * skb ;
unsigned int len ;
/* Queue is full! Find the longest slot and
drop a packet from it */
if ( d > 1 ) {
sfq_index x = q - > dep [ d + SFQ_DEPTH ] . next ;
skb = q - > qs [ x ] . prev ;
len = skb - > len ;
__skb_unlink ( skb , & q - > qs [ x ] ) ;
kfree_skb ( skb ) ;
sfq_dec ( q , x ) ;
sch - > q . qlen - - ;
sch - > qstats . drops + + ;
2006-03-21 06:01:38 +03:00
sch - > qstats . backlog - = len ;
2005-04-17 02:20:36 +04:00
return len ;
}
if ( d = = 1 ) {
/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
d = q - > next [ q - > tail ] ;
q - > next [ q - > tail ] = q - > next [ d ] ;
q - > allot [ q - > next [ d ] ] + = q - > quantum ;
skb = q - > qs [ d ] . prev ;
len = skb - > len ;
__skb_unlink ( skb , & q - > qs [ d ] ) ;
kfree_skb ( skb ) ;
sfq_dec ( q , d ) ;
sch - > q . qlen - - ;
q - > ht [ q - > hash [ d ] ] = SFQ_DEPTH ;
sch - > qstats . drops + + ;
2006-03-21 06:01:38 +03:00
sch - > qstats . backlog - = len ;
2005-04-17 02:20:36 +04:00
return len ;
}
return 0 ;
}
static int
sfq_enqueue ( struct sk_buff * skb , struct Qdisc * sch )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
unsigned hash = sfq_hash ( q , skb ) ;
sfq_index x ;
x = q - > ht [ hash ] ;
if ( x = = SFQ_DEPTH ) {
q - > ht [ hash ] = x = q - > dep [ SFQ_DEPTH ] . next ;
q - > hash [ x ] = hash ;
}
2007-10-01 04:51:33 +04:00
/* If selected queue has length q->limit, this means that
* all another queues are empty and that we do simple tail drop ,
* i . e . drop _this_ packet .
*/
if ( q - > qs [ x ] . qlen > = q - > limit )
return qdisc_drop ( skb , sch ) ;
2006-03-21 06:01:38 +03:00
sch - > qstats . backlog + = skb - > len ;
2005-04-17 02:20:36 +04:00
__skb_queue_tail ( & q - > qs [ x ] , skb ) ;
sfq_inc ( q , x ) ;
if ( q - > qs [ x ] . qlen = = 1 ) { /* The flow is new */
if ( q - > tail = = SFQ_DEPTH ) { /* It is the first flow */
q - > tail = x ;
q - > next [ x ] = x ;
q - > allot [ x ] = q - > quantum ;
} else {
q - > next [ x ] = q - > next [ q - > tail ] ;
q - > next [ q - > tail ] = x ;
q - > tail = x ;
}
}
2007-09-19 21:42:03 +04:00
if ( + + sch - > q . qlen < = q - > limit ) {
2005-04-17 02:20:36 +04:00
sch - > bstats . bytes + = skb - > len ;
sch - > bstats . packets + + ;
return 0 ;
}
sfq_drop ( sch ) ;
return NET_XMIT_CN ;
}
static int
sfq_requeue ( struct sk_buff * skb , struct Qdisc * sch )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
unsigned hash = sfq_hash ( q , skb ) ;
sfq_index x ;
x = q - > ht [ hash ] ;
if ( x = = SFQ_DEPTH ) {
q - > ht [ hash ] = x = q - > dep [ SFQ_DEPTH ] . next ;
q - > hash [ x ] = hash ;
}
2006-03-21 06:01:38 +03:00
sch - > qstats . backlog + = skb - > len ;
2005-04-17 02:20:36 +04:00
__skb_queue_head ( & q - > qs [ x ] , skb ) ;
2007-10-01 04:51:33 +04:00
/* If selected queue has length q->limit+1, this means that
* all another queues are empty and we do simple tail drop .
* This packet is still requeued at head of queue , tail packet
* is dropped .
*/
if ( q - > qs [ x ] . qlen > q - > limit ) {
skb = q - > qs [ x ] . prev ;
__skb_unlink ( skb , & q - > qs [ x ] ) ;
sch - > qstats . drops + + ;
sch - > qstats . backlog - = skb - > len ;
kfree_skb ( skb ) ;
return NET_XMIT_CN ;
}
2005-04-17 02:20:36 +04:00
sfq_inc ( q , x ) ;
if ( q - > qs [ x ] . qlen = = 1 ) { /* The flow is new */
if ( q - > tail = = SFQ_DEPTH ) { /* It is the first flow */
q - > tail = x ;
q - > next [ x ] = x ;
q - > allot [ x ] = q - > quantum ;
} else {
q - > next [ x ] = q - > next [ q - > tail ] ;
q - > next [ q - > tail ] = x ;
q - > tail = x ;
}
}
2007-09-19 21:42:03 +04:00
if ( + + sch - > q . qlen < = q - > limit ) {
2005-04-17 02:20:36 +04:00
sch - > qstats . requeues + + ;
return 0 ;
}
sch - > qstats . drops + + ;
sfq_drop ( sch ) ;
return NET_XMIT_CN ;
}
static struct sk_buff *
sfq_dequeue ( struct Qdisc * sch )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
struct sk_buff * skb ;
sfq_index a , old_a ;
/* No active slots */
if ( q - > tail = = SFQ_DEPTH )
return NULL ;
a = old_a = q - > next [ q - > tail ] ;
/* Grab packet */
skb = __skb_dequeue ( & q - > qs [ a ] ) ;
sfq_dec ( q , a ) ;
sch - > q . qlen - - ;
2006-03-21 06:01:38 +03:00
sch - > qstats . backlog - = skb - > len ;
2005-04-17 02:20:36 +04:00
/* Is the slot empty? */
if ( q - > qs [ a ] . qlen = = 0 ) {
q - > ht [ q - > hash [ a ] ] = SFQ_DEPTH ;
a = q - > next [ a ] ;
if ( a = = old_a ) {
q - > tail = SFQ_DEPTH ;
return skb ;
}
q - > next [ q - > tail ] = a ;
q - > allot [ a ] + = q - > quantum ;
} else if ( ( q - > allot [ a ] - = skb - > len ) < = 0 ) {
q - > tail = a ;
a = q - > next [ a ] ;
q - > allot [ a ] + = q - > quantum ;
}
return skb ;
}
static void
sfq_reset ( struct Qdisc * sch )
{
struct sk_buff * skb ;
while ( ( skb = sfq_dequeue ( sch ) ) ! = NULL )
kfree_skb ( skb ) ;
}
static void sfq_perturbation ( unsigned long arg )
{
struct Qdisc * sch = ( struct Qdisc * ) arg ;
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
2007-10-01 04:51:33 +04:00
get_random_bytes ( & q - > perturbation , 4 ) ;
2005-04-17 02:20:36 +04:00
2007-10-01 04:51:33 +04:00
if ( q - > perturb_period )
mod_timer ( & q - > perturb_timer , jiffies + q - > perturb_period ) ;
2005-04-17 02:20:36 +04:00
}
static int sfq_change ( struct Qdisc * sch , struct rtattr * opt )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
struct tc_sfq_qopt * ctl = RTA_DATA ( opt ) ;
2006-11-30 04:36:20 +03:00
unsigned int qlen ;
2005-04-17 02:20:36 +04:00
if ( opt - > rta_len < RTA_LENGTH ( sizeof ( * ctl ) ) )
return - EINVAL ;
sch_tree_lock ( sch ) ;
q - > quantum = ctl - > quantum ? : psched_mtu ( sch - > dev ) ;
q - > perturb_period = ctl - > perturb_period * HZ ;
if ( ctl - > limit )
2007-10-01 04:51:33 +04:00
q - > limit = min_t ( u32 , ctl - > limit , SFQ_DEPTH - 1 ) ;
2005-04-17 02:20:36 +04:00
2006-11-30 04:36:20 +03:00
qlen = sch - > q . qlen ;
2007-09-19 21:42:03 +04:00
while ( sch - > q . qlen > q - > limit )
2005-04-17 02:20:36 +04:00
sfq_drop ( sch ) ;
2006-11-30 04:36:20 +03:00
qdisc_tree_decrease_qlen ( sch , qlen - sch - > q . qlen ) ;
2005-04-17 02:20:36 +04:00
del_timer ( & q - > perturb_timer ) ;
if ( q - > perturb_period ) {
2007-10-01 04:51:33 +04:00
mod_timer ( & q - > perturb_timer , jiffies + q - > perturb_period ) ;
get_random_bytes ( & q - > perturbation , 4 ) ;
2005-04-17 02:20:36 +04:00
}
sch_tree_unlock ( sch ) ;
return 0 ;
}
static int sfq_init ( struct Qdisc * sch , struct rtattr * opt )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
int i ;
init_timer ( & q - > perturb_timer ) ;
q - > perturb_timer . data = ( unsigned long ) sch ;
q - > perturb_timer . function = sfq_perturbation ;
for ( i = 0 ; i < SFQ_HASH_DIVISOR ; i + + )
q - > ht [ i ] = SFQ_DEPTH ;
for ( i = 0 ; i < SFQ_DEPTH ; i + + ) {
skb_queue_head_init ( & q - > qs [ i ] ) ;
q - > dep [ i + SFQ_DEPTH ] . next = i + SFQ_DEPTH ;
q - > dep [ i + SFQ_DEPTH ] . prev = i + SFQ_DEPTH ;
}
2007-10-01 04:51:33 +04:00
q - > limit = SFQ_DEPTH - 1 ;
2005-04-17 02:20:36 +04:00
q - > max_depth = 0 ;
q - > tail = SFQ_DEPTH ;
if ( opt = = NULL ) {
q - > quantum = psched_mtu ( sch - > dev ) ;
q - > perturb_period = 0 ;
2007-10-01 04:51:33 +04:00
get_random_bytes ( & q - > perturbation , 4 ) ;
2005-04-17 02:20:36 +04:00
} else {
int err = sfq_change ( sch , opt ) ;
if ( err )
return err ;
}
for ( i = 0 ; i < SFQ_DEPTH ; i + + )
sfq_link ( q , i ) ;
return 0 ;
}
static void sfq_destroy ( struct Qdisc * sch )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
del_timer ( & q - > perturb_timer ) ;
}
static int sfq_dump ( struct Qdisc * sch , struct sk_buff * skb )
{
struct sfq_sched_data * q = qdisc_priv ( sch ) ;
2007-04-20 07:29:13 +04:00
unsigned char * b = skb_tail_pointer ( skb ) ;
2005-04-17 02:20:36 +04:00
struct tc_sfq_qopt opt ;
opt . quantum = q - > quantum ;
opt . perturb_period = q - > perturb_period / HZ ;
opt . limit = q - > limit ;
opt . divisor = SFQ_HASH_DIVISOR ;
opt . flows = q - > limit ;
RTA_PUT ( skb , TCA_OPTIONS , sizeof ( opt ) , & opt ) ;
return skb - > len ;
rtattr_failure :
2007-03-26 10:06:12 +04:00
nlmsg_trim ( skb , b ) ;
2005-04-17 02:20:36 +04:00
return - 1 ;
}
/*
 * Operations table registered with the packet scheduler core by
 * sfq_module_init(). The qdisc is classless (no cl_ops).
 */
static struct Qdisc_ops sfq_qdisc_ops = {
. next = NULL ,
. cl_ops = NULL ,
. id = " sfq " ,
. priv_size = sizeof ( struct sfq_sched_data ) ,
. enqueue = sfq_enqueue ,
. dequeue = sfq_dequeue ,
. requeue = sfq_requeue ,
. drop = sfq_drop ,
. init = sfq_init ,
. reset = sfq_reset ,
. destroy = sfq_destroy ,
/* NOTE(review): sfq_change() exists but is only called from sfq_init();
 * it is not exposed as the change operation here - confirm this is
 * intentional. */
. change = NULL ,
. dump = sfq_dump ,
. owner = THIS_MODULE ,
} ;
/*
 * Module entry point: register the SFQ operations table.
 * Returns whatever register_qdisc() returns.
 */
static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
2007-02-09 17:25:16 +03:00
/*
 * Module exit point: unregister the SFQ operations table.
 */
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
/* Hook the init/exit functions into module load/unload and declare the
 * module license. */
module_init ( sfq_module_init )
module_exit ( sfq_module_exit )
MODULE_LICENSE ( " GPL " ) ;