2005-12-14 10:13:28 +03:00
/*
2008-10-29 07:07:18 +03:00
* TCP CUBIC: Binary Increase Congestion control for TCP v2.3
2008-03-05 01:17:41 +03:00
* Home page :
*	http://netsrv.csc.ncsu.edu/twiki/bin/view/Main/BIC
2005-12-14 10:13:28 +03:00
* This is from the implementation of CUBIC TCP in
2008-10-29 07:07:18 +03:00
* Sangtae Ha , Injong Rhee and Lisong Xu ,
* " CUBIC: A New TCP-Friendly High-Speed TCP Variant "
* in ACM SIGOPS Operating System Review , July 2008.
2005-12-14 10:13:28 +03:00
* Available from :
2008-10-29 07:07:18 +03:00
*	http://netsrv.csc.ncsu.edu/export/cubic_a_new_tcp_2008.pdf
*
* CUBIC integrates a new slow start algorithm , called HyStart .
* The details of HyStart are presented in
* Sangtae Ha and Injong Rhee ,
* " Taming the Elephants: New TCP Slow Start " , NCSU TechReport 2008.
* Available from :
*	http://netsrv.csc.ncsu.edu/export/hystart_techreport_2008.pdf
*
* All testing results are available from :
*	http://netsrv.csc.ncsu.edu/wiki/index.php/TCP_Testing
2005-12-14 10:13:28 +03:00
*
* Unless CUBIC is enabled and congestion window is large
* this behaves the same as the original Reno .
*/
# include <linux/mm.h>
# include <linux/module.h>
2008-05-01 15:34:28 +04:00
# include <linux/math64.h>
2005-12-14 10:13:28 +03:00
# include <net/tcp.h>
/*
 * Fixed-point scale for beta.  A window is multiplied by beta and then
 * divided by this constant, e.g.
 *	max_cwnd = snd_cwnd * beta / BICTCP_BETA_SCALE
 */
#define BICTCP_BETA_SCALE	1024

/* CUBIC keeps time in units of 1/2^BICTCP_HZ seconds (2^10 = 1024) */
#define BICTCP_HZ		10

/* Two methods of hybrid slow start; bit flags that may be OR-ed together */
#define HYSTART_ACK_TRAIN	0x1
#define HYSTART_DELAY		0x2

/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES	8

/* Lower/upper bound of the delay-increase threshold (value << 3) */
#define HYSTART_DELAY_MIN	(2U << 3)
#define HYSTART_DELAY_MAX	(16U << 3)

/* Clamp a candidate threshold x into [HYSTART_DELAY_MIN, HYSTART_DELAY_MAX] */
#define HYSTART_DELAY_THRESH(x)	\
	clamp((x), HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
2007-02-13 00:15:20 +03:00
/* ---- Tunables (each one is registered as a module parameter) ---- */

/* non-zero: remember a reduced Wmax when a loss hits below the old Wmax */
static int fast_convergence __read_mostly = 1;

/* multiplicative decrease factor, = 717/1024 (BICTCP_BETA_SCALE) */
static int beta __read_mostly = 717;

/* non-zero: initial snd_ssthresh, applied only when hystart is off */
static int initial_ssthresh __read_mostly;

/* scaling constant of the cubic function, scaled by 1024 */
static int bic_scale __read_mostly = 41;

/* non-zero: never grow slower than the estimated standard TCP window */
static int tcp_friendliness __read_mostly = 1;

/* ---- Hybrid slow start tunables ---- */

/* non-zero: hybrid slow start enabled */
static int hystart __read_mostly = 1;

/* mask of detection methods allowed to end slow start */
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;

/* hybrid slow start is only consulted once cwnd reaches this value */
static int hystart_low_window __read_mostly = 16;

/* ---- Scale factors precomputed once in cubictcp_register() ---- */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;
2005-12-22 06:32:08 +03:00
/* Note parameters that are used for precomputing scale factors are read-only */
/*
 * Module parameter registration: every tunable is paired with a
 * human-readable description string.
 *
 * bic_scale is registered 0444 because cube_rtt_scale and cube_factor are
 * derived from it once in cubictcp_register().
 *
 * NOTE(review): beta is registered 0644 although beta_scale is also derived
 * from it once in cubictcp_register(); a later write to beta would be seen
 * by bictcp_recalc_ssthresh() but not by beta_scale.  Confirm this is
 * intended.
 */
2005-12-14 10:13:28 +03:00
module_param ( fast_convergence , int , 0644 ) ;
MODULE_PARM_DESC ( fast_convergence , " turn on/off fast convergence " ) ;
2008-03-05 01:17:41 +03:00
module_param ( beta , int , 0644 ) ;
2005-12-14 10:13:28 +03:00
MODULE_PARM_DESC ( beta , " beta for multiplicative increase " ) ;
module_param ( initial_ssthresh , int , 0644 ) ;
MODULE_PARM_DESC ( initial_ssthresh , " initial value of slow start threshold " ) ;
2005-12-22 06:32:08 +03:00
module_param ( bic_scale , int , 0444 ) ;
2005-12-14 10:13:28 +03:00
MODULE_PARM_DESC ( bic_scale , " scale (scaled by 1024) value for bic function (bic_scale/1024) " ) ;
module_param ( tcp_friendliness , int , 0644 ) ;
MODULE_PARM_DESC ( tcp_friendliness , " turn on/off tcp friendliness " ) ;
2008-10-29 07:07:18 +03:00
/* Hybrid slow start knobs */
module_param ( hystart , int , 0644 ) ;
MODULE_PARM_DESC ( hystart , " turn on/off hybrid slow start algorithm " ) ;
module_param ( hystart_detect , int , 0644 ) ;
/*
 * NOTE(review): "hyrbrid" in the description below is a typo for "hybrid";
 * it is left untouched here because it is part of the description string.
 */
MODULE_PARM_DESC ( hystart_detect , " hyrbrid slow start detection mechanisms "
" 1: packet-train 2: delay 3: both packet-train and delay " ) ;
module_param ( hystart_low_window , int , 0644 ) ;
MODULE_PARM_DESC ( hystart_low_window , " lower bound cwnd for hybrid slow start " ) ;
2005-12-14 10:13:28 +03:00
/* BIC TCP Parameters */
struct bictcp {
u32 cnt ; /* increase cwnd by 1 after ACKs */
u32 last_max_cwnd ; /* last maximum snd_cwnd */
u32 loss_cwnd ; /* congestion window at last loss */
u32 last_cwnd ; /* the last snd_cwnd */
u32 last_time ; /* time when updated last_cwnd */
u32 bic_origin_point ; /* origin point of bic function */
u32 bic_K ; /* time to origin point from the beginning of the current epoch */
u32 delay_min ; /* min delay */
u32 epoch_start ; /* beginning of an epoch */
u32 ack_cnt ; /* number of acks */
u32 tcp_cwnd ; /* estimated tcp cwnd */
# define ACK_RATIO_SHIFT 4
2008-10-29 07:07:18 +03:00
u16 delayed_ack ; /* estimate the ratio of Packets/ACKs << 4 */
u8 sample_cnt ; /* number of samples to decide curr_rtt */
u8 found ; /* the exit point is found? */
u32 round_start ; /* beginning of each round */
u32 end_seq ; /* end_seq of the round */
u32 last_jiffies ; /* last time when the ACK spacing is close */
u32 curr_rtt ; /* the minimum rtt of current round */
2005-12-14 10:13:28 +03:00
} ;
static inline void bictcp_reset ( struct bictcp * ca )
{
ca - > cnt = 0 ;
ca - > last_max_cwnd = 0 ;
ca - > loss_cwnd = 0 ;
ca - > last_cwnd = 0 ;
ca - > last_time = 0 ;
ca - > bic_origin_point = 0 ;
ca - > bic_K = 0 ;
ca - > delay_min = 0 ;
ca - > epoch_start = 0 ;
ca - > delayed_ack = 2 < < ACK_RATIO_SHIFT ;
ca - > ack_cnt = 0 ;
ca - > tcp_cwnd = 0 ;
2008-10-29 07:07:18 +03:00
ca - > found = 0 ;
}
static inline void bictcp_hystart_reset ( struct sock * sk )
{
struct tcp_sock * tp = tcp_sk ( sk ) ;
struct bictcp * ca = inet_csk_ca ( sk ) ;
ca - > round_start = ca - > last_jiffies = jiffies ;
ca - > end_seq = tp - > snd_nxt ;
ca - > curr_rtt = 0 ;
ca - > sample_cnt = 0 ;
2005-12-14 10:13:28 +03:00
}
static void bictcp_init ( struct sock * sk )
{
bictcp_reset ( inet_csk_ca ( sk ) ) ;
2008-10-29 07:07:18 +03:00
if ( hystart )
bictcp_hystart_reset ( sk ) ;
if ( ! hystart & & initial_ssthresh )
2005-12-14 10:13:28 +03:00
tcp_sk ( sk ) - > snd_ssthresh = initial_ssthresh ;
}
2007-03-22 22:10:58 +03:00
/* calculate the cubic root of x using a table lookup followed by one
* Newton - Raphson iteration .
* Avg err ~ = 0.195 %
2005-12-14 10:13:28 +03:00
*/
2005-12-22 06:32:36 +03:00
static u32 cubic_root ( u64 a )
2005-12-14 10:13:28 +03:00
{
2007-03-22 22:10:58 +03:00
u32 x , b , shift ;
/*
* cbrt ( x ) MSB values for x MSB values in [ 0. .63 ] .
* Precomputed then refined by hand - Willy Tarreau
*
* For x in [ 0. .63 ] ,
* v = cbrt ( x < < 18 ) - 1
* cbrt ( x ) = ( v [ x ] + 10 ) > > 6
2005-12-22 06:32:36 +03:00
*/
2007-03-22 22:10:58 +03:00
static const u8 v [ ] = {
/* 0x00 */ 0 , 54 , 54 , 54 , 118 , 118 , 118 , 118 ,
/* 0x08 */ 123 , 129 , 134 , 138 , 143 , 147 , 151 , 156 ,
/* 0x10 */ 157 , 161 , 164 , 168 , 170 , 173 , 176 , 179 ,
/* 0x18 */ 181 , 185 , 187 , 190 , 192 , 194 , 197 , 199 ,
/* 0x20 */ 200 , 202 , 204 , 206 , 209 , 211 , 213 , 215 ,
/* 0x28 */ 217 , 219 , 221 , 222 , 224 , 225 , 227 , 229 ,
/* 0x30 */ 231 , 232 , 234 , 236 , 237 , 239 , 240 , 242 ,
/* 0x38 */ 244 , 245 , 246 , 248 , 250 , 251 , 252 , 254 ,
} ;
b = fls64 ( a ) ;
if ( b < 7 ) {
/* a in [0..63] */
return ( ( u32 ) v [ ( u32 ) a ] + 35 ) > > 6 ;
}
b = ( ( b * 84 ) > > 8 ) - 1 ;
shift = ( a > > ( b * 3 ) ) ;
x = ( ( u32 ) ( ( ( u32 ) v [ shift ] + 10 ) < < b ) ) > > 6 ;
/*
* Newton - Raphson iteration
* 2
* x = ( 2 * x + a / x ) / 3
* k + 1 k k
*/
2008-05-01 15:34:28 +04:00
x = ( 2 * x + ( u32 ) div64_u64 ( a , ( u64 ) x * ( u64 ) ( x - 1 ) ) ) ;
2007-03-22 22:10:58 +03:00
x = ( ( x * 341 ) > > 10 ) ;
2005-12-22 06:32:36 +03:00
return x ;
2005-12-14 10:13:28 +03:00
}
/*
 * Compute congestion window to use.
 *
 * @ca:   CUBIC state of the connection
 * @cwnd: current congestion window
 *
 * The result is stored in ca->cnt, the number of ACKs needed before the
 * window grows by one; it is never left at zero.
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
	u64 dist;			/* |t - K| */
	u32 delta, t, bic_target, max_cnt;
	int below_origin;

	ca->ack_cnt++;			/* every call counts one ACK */

	/* same window and at most HZ/32 ticks since the last pass: keep cnt */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) {
		/* a new epoch begins here */
		ca->epoch_start = tcp_time_stamp;
		ca->ack_cnt = 1;
		ca->tcp_cwnd = cwnd;	/* TCP estimate restarts from cwnd */

		if (ca->last_max_cwnd > cwnd) {
			/*
			 * Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		} else {
			/* already at or above the old maximum */
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		}
	}

	/*
	 * Cubic function: c * time^3 / rtt.
	 *
	 * time^3 is evaluated in 64 bit to avoid overflow; all divisions
	 * are kept 32 bit.  Units:
	 *	time = (t - K) / 2^bictcp_HZ
	 *	c    = bic_scale >> 10
	 *	rtt  = (srtt >> 3) / HZ
	 * The original authors state the computation does not overflow as
	 * long as cwnd < 1 million packets.
	 */

	/* elapsed epoch time plus the minimum delay, HZ -> bictcp_HZ units */
	t = ((tcp_time_stamp + (ca->delay_min >> 3) - ca->epoch_start)
	     << BICTCP_HZ) / HZ;

	below_origin = t < ca->bic_K;
	dist = below_origin ? ca->bic_K - t : t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * dist * dist * dist) >> (10 + 3 * BICTCP_HZ);

	bic_target = below_origin ? ca->bic_origin_point - delta
				  : ca->bic_origin_point + delta;

	/* ACKs per increment; "100 * cwnd" is a very small increment */
	ca->cnt = (bic_target > cwnd) ? cwnd / (bic_target - cwnd)
				      : 100 * cwnd;

	/* TCP Friendly */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		/* advance the estimated TCP window by the ACKs counted */
		while (ca->ack_cnt > delta) {
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		/* cubic is slower than the TCP estimate: cap cnt */
		if (ca->tcp_cwnd > cwnd) {
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* compensate for the measured packets-per-ACK ratio */
	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
	if (ca->cnt == 0)		/* cannot be zero */
		ca->cnt = 1;
}
2007-12-02 01:47:59 +03:00
/*
 * .cong_avoid hook.  Nothing happens unless the flow is cwnd-limited.
 * Above ssthresh the window follows the cubic function; at or below it the
 * window is grown by slow start, and a new hybrid slow start round is
 * opened once @ack has passed the end of the current round.
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
{
	struct bictcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd > tp->snd_ssthresh) {
		/* congestion avoidance */
		bictcp_update(ca, tp->snd_cwnd);
		tcp_cong_avoid_ai(tp, ca->cnt);
		return;
	}

	/* slow start */
	if (hystart && after(ack, ca->end_seq))
		bictcp_hystart_reset(sk);
	tcp_slow_start(tp);
}
static u32 bictcp_recalc_ssthresh ( struct sock * sk )
{
const struct tcp_sock * tp = tcp_sk ( sk ) ;
struct bictcp * ca = inet_csk_ca ( sk ) ;
ca - > epoch_start = 0 ; /* end of epoch */
/* Wmax and fast convergence */
if ( tp - > snd_cwnd < ca - > last_max_cwnd & & fast_convergence )
ca - > last_max_cwnd = ( tp - > snd_cwnd * ( BICTCP_BETA_SCALE + beta ) )
/ ( 2 * BICTCP_BETA_SCALE ) ;
else
ca - > last_max_cwnd = tp - > snd_cwnd ;
ca - > loss_cwnd = tp - > snd_cwnd ;
return max ( ( tp - > snd_cwnd * beta ) / BICTCP_BETA_SCALE , 2U ) ;
}
static u32 bictcp_undo_cwnd ( struct sock * sk )
{
struct bictcp * ca = inet_csk_ca ( sk ) ;
return max ( tcp_sk ( sk ) - > snd_cwnd , ca - > last_max_cwnd ) ;
}
/*
 * .set_state hook: entering TCP_CA_Loss throws away all CUBIC state and
 * opens a fresh hybrid slow start round.  Other states are ignored.
 */
static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state != TCP_CA_Loss)
		return;

	bictcp_reset(inet_csk_ca(sk));
	bictcp_hystart_reset(sk);
}
/*
 * Hybrid slow start: look for a safe exit point from slow start.
 *
 * @sk:    socket being updated
 * @delay: latest delay sample, in the same units as ca->delay_min
 *
 * Does nothing once an enabled detector has already fired.  When one fires
 * here, snd_ssthresh is set to the current snd_cwnd.
 */
static void hystart_update(struct sock *sk, u32 delay)
{
	struct bictcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 now;

	if (ca->found & hystart_detect)
		return;

	now = jiffies;

	/* first detection parameter - ack-train detection */
	if (now - ca->last_jiffies <= msecs_to_jiffies(2)) {
		/* ACKs are still closely spaced: the train continues */
		ca->last_jiffies = now;
		if (now - ca->round_start >= ca->delay_min >> 4)
			ca->found |= HYSTART_ACK_TRAIN;
	}

	/* second detection parameter - increase of the round's minimum delay */
	if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
		/* still sampling: track the minimum delay of this round */
		if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
			ca->curr_rtt = delay;

		ca->sample_cnt++;
	} else if (ca->curr_rtt > ca->delay_min +
		   HYSTART_DELAY_THRESH(ca->delay_min >> 4)) {
		ca->found |= HYSTART_DELAY;
	}

	/*
	 * Either one of two conditions are met,
	 * we exit from slow start immediately.
	 */
	if (ca->found & hystart_detect)
		tp->snd_ssthresh = tp->snd_cwnd;
}
/* Track delayed acknowledgment ratio using sliding window
* ratio = ( 15 * ratio + sample ) / 16
*/
2007-07-26 10:49:34 +04:00
static void bictcp_acked ( struct sock * sk , u32 cnt , s32 rtt_us )
2005-12-14 10:13:28 +03:00
{
const struct inet_connection_sock * icsk = inet_csk ( sk ) ;
2008-10-29 07:07:18 +03:00
const struct tcp_sock * tp = tcp_sk ( sk ) ;
2007-07-26 10:50:06 +04:00
struct bictcp * ca = inet_csk_ca ( sk ) ;
u32 delay ;
2005-12-14 10:13:28 +03:00
2007-05-31 11:16:47 +04:00
if ( icsk - > icsk_ca_state = = TCP_CA_Open ) {
2005-12-14 10:13:28 +03:00
cnt - = ca - > delayed_ack > > ACK_RATIO_SHIFT ;
ca - > delayed_ack + = cnt ;
}
2007-07-26 10:50:06 +04:00
/* Some calls are for duplicates without timetamps */
if ( rtt_us < 0 )
return ;
/* Discard delay samples right after fast recovery */
if ( ( s32 ) ( tcp_time_stamp - ca - > epoch_start ) < HZ )
return ;
delay = usecs_to_jiffies ( rtt_us ) < < 3 ;
if ( delay = = 0 )
delay = 1 ;
/* first time call or link delay decreases */
if ( ca - > delay_min = = 0 | | ca - > delay_min > delay )
ca - > delay_min = delay ;
2008-10-29 07:07:18 +03:00
/* hystart triggers when cwnd is larger than some threshold */
if ( hystart & & tp - > snd_cwnd < = tp - > snd_ssthresh & &
tp - > snd_cwnd > = hystart_low_window )
hystart_update ( sk , delay ) ;
2007-07-26 10:50:06 +04:00
}
2005-12-14 10:13:28 +03:00
/*
 * Congestion control operations registered for CUBIC.
 *
 * .name is the string the algorithm is registered under, so it has to be
 * exactly "cubic" with no surrounding whitespace.
 */
static struct tcp_congestion_ops cubictcp = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.pkts_acked	= bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
/*
 * Module init: check that struct bictcp fits into the per-socket private
 * area, precompute the scale factors used on every update, and register
 * the algorithm.  Returns the result of tcp_register_congestion_control().
 */
static int __init cubictcp_register(void)
{
	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/*
	 * Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	/*
	 * cube_factor: calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is bictcp_HZ=2^10, not HZ
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * According to the original authors this has been designed and
	 * tested for cwnd < 1 million packets and RTT < 100 seconds.
	 *
	 * Start from 2^(10 + 3*bictcp_HZ) = 2^40, then divide by bic_scale
	 * and by the constant SRTT (100ms).
	 */
	cube_factor = 1ull << (10 + 3 * BICTCP_HZ);
	do_div(cube_factor, bic_scale * 10);

	/* 1024*c/rtt */
	cube_rtt_scale = (bic_scale * 10);

	/* growth scale of the estimated TCP window, derived from beta */
	beta_scale = 8 * (BICTCP_BETA_SCALE + beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	return tcp_register_congestion_control(&cubictcp);
}
/* Module exit: remove CUBIC from the registered congestion controls. */
static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}
/* Entry and exit points of the module */
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

/*
 * Module metadata.  The license string must be exactly "GPL"; padding
 * inside the literal would make it a different, unrecognised license.
 */
MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("CUBIC TCP");
MODULE_VERSION("2.3");