2005-12-13 23:13:28 -08:00
/*
2007-03-24 21:34:38 -07:00
* TCP CUBIC : Binary Increase Congestion control for TCP v2 .1
2005-12-13 23:13:28 -08:00
*
* This is from the implementation of CUBIC TCP in
* Injong Rhee, Lisong Xu.
*  "CUBIC: A New TCP-Friendly High-Speed TCP Variant",
*  in PFLDnet 2005.
* Available from:
*  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
*
* Unless CUBIC is enabled and congestion window is large
* this behaves the same as the original Reno .
*/
# include <linux/mm.h>
# include <linux/module.h>
# include <net/tcp.h>
2005-12-21 19:32:08 -08:00
# include <asm/div64.h>
2005-12-13 23:13:28 -08:00
/*
 * Scale factor for the beta calculation:
 *	max_cwnd = snd_cwnd * beta
 * beta is expressed as a fraction of this value.
 */
#define BICTCP_BETA_SCALE	1024

/* In binary search, go to point (max + min) / N */
#define BICTCP_B		4

/* BIC HZ: time is kept in units of 2^-10 s (2^10 = 1024) */
#define BICTCP_HZ		10
2007-02-12 13:15:20 -08:00
/* Tunables; each one is exported below through module_param(). */
static int fast_convergence __read_mostly = 1;
static int max_increment __read_mostly = 16;
static int beta __read_mostly = 819;	/* = 819/1024 (BICTCP_BETA_SCALE) */
static int initial_ssthresh __read_mostly;
static int bic_scale __read_mostly = 41;
static int tcp_friendliness __read_mostly = 1;

/* Scale factors precomputed once in cubictcp_register(). */
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
static u64 cube_factor __read_mostly;
2005-12-21 19:32:08 -08:00
/*
 * Note: parameters that are used for precomputing scale factors
 * (beta and bic_scale, mode 0444) are read-only.
 */
module_param(fast_convergence, int, 0644);
MODULE_PARM_DESC(fast_convergence, " turn on/off fast convergence ");

module_param(max_increment, int, 0644);
MODULE_PARM_DESC(max_increment, " Limit on increment allowed during binary search ");

module_param(beta, int, 0444);
MODULE_PARM_DESC(beta, " beta for multiplicative increase ");

module_param(initial_ssthresh, int, 0644);
MODULE_PARM_DESC(initial_ssthresh, " initial value of slow start threshold ");

module_param(bic_scale, int, 0444);
MODULE_PARM_DESC(bic_scale, " scale (scaled by 1024) value for bic function (bic_scale/1024) ");

module_param(tcp_friendliness, int, 0644);
MODULE_PARM_DESC(tcp_friendliness, " turn on/off tcp friendliness ");
/* Fixed-point shift used for the Packets/ACKs ratio kept in delayed_ack */
#define ACK_RATIO_SHIFT	4

/* BIC TCP Parameters (per-connection congestion control state) */
struct bictcp {
	u32	cnt;		/* increase cwnd by 1 after ACKs */
	u32	last_max_cwnd;	/* last maximum snd_cwnd */
	u32	loss_cwnd;	/* congestion window at last loss */
	u32	last_cwnd;	/* the last snd_cwnd */
	u32	last_time;	/* time when updated last_cwnd */
	u32	bic_origin_point;/* origin point of bic function */
	u32	bic_K;		/* time to origin point from the beginning of the current epoch */
	u32	delay_min;	/* min delay */
	u32	epoch_start;	/* beginning of an epoch */
	u32	ack_cnt;	/* number of acks */
	u32	tcp_cwnd;	/* estimated tcp cwnd */
	u32	delayed_ack;	/* estimate the ratio of Packets/ACKs << 4 */
};
static inline void bictcp_reset ( struct bictcp * ca )
{
ca - > cnt = 0 ;
ca - > last_max_cwnd = 0 ;
ca - > loss_cwnd = 0 ;
ca - > last_cwnd = 0 ;
ca - > last_time = 0 ;
ca - > bic_origin_point = 0 ;
ca - > bic_K = 0 ;
ca - > delay_min = 0 ;
ca - > epoch_start = 0 ;
ca - > delayed_ack = 2 < < ACK_RATIO_SHIFT ;
ca - > ack_cnt = 0 ;
ca - > tcp_cwnd = 0 ;
}
static void bictcp_init ( struct sock * sk )
{
bictcp_reset ( inet_csk_ca ( sk ) ) ;
if ( initial_ssthresh )
tcp_sk ( sk ) - > snd_ssthresh = initial_ssthresh ;
}
2007-03-22 12:10:58 -07:00
/* calculate the cubic root of x using a table lookup followed by one
* Newton - Raphson iteration .
* Avg err ~ = 0.195 %
2005-12-13 23:13:28 -08:00
*/
2005-12-21 19:32:36 -08:00
static u32 cubic_root ( u64 a )
2005-12-13 23:13:28 -08:00
{
2007-03-22 12:10:58 -07:00
u32 x , b , shift ;
/*
* cbrt ( x ) MSB values for x MSB values in [ 0. .63 ] .
* Precomputed then refined by hand - Willy Tarreau
*
* For x in [ 0. .63 ] ,
* v = cbrt ( x < < 18 ) - 1
* cbrt ( x ) = ( v [ x ] + 10 ) > > 6
2005-12-21 19:32:36 -08:00
*/
2007-03-22 12:10:58 -07:00
static const u8 v [ ] = {
/* 0x00 */ 0 , 54 , 54 , 54 , 118 , 118 , 118 , 118 ,
/* 0x08 */ 123 , 129 , 134 , 138 , 143 , 147 , 151 , 156 ,
/* 0x10 */ 157 , 161 , 164 , 168 , 170 , 173 , 176 , 179 ,
/* 0x18 */ 181 , 185 , 187 , 190 , 192 , 194 , 197 , 199 ,
/* 0x20 */ 200 , 202 , 204 , 206 , 209 , 211 , 213 , 215 ,
/* 0x28 */ 217 , 219 , 221 , 222 , 224 , 225 , 227 , 229 ,
/* 0x30 */ 231 , 232 , 234 , 236 , 237 , 239 , 240 , 242 ,
/* 0x38 */ 244 , 245 , 246 , 248 , 250 , 251 , 252 , 254 ,
} ;
b = fls64 ( a ) ;
if ( b < 7 ) {
/* a in [0..63] */
return ( ( u32 ) v [ ( u32 ) a ] + 35 ) > > 6 ;
}
b = ( ( b * 84 ) > > 8 ) - 1 ;
shift = ( a > > ( b * 3 ) ) ;
x = ( ( u32 ) ( ( ( u32 ) v [ shift ] + 10 ) < < b ) ) > > 6 ;
/*
* Newton - Raphson iteration
* 2
* x = ( 2 * x + a / x ) / 3
* k + 1 k k
*/
x = ( 2 * x + ( u32 ) div64_64 ( a , ( u64 ) x * ( u64 ) ( x - 1 ) ) ) ;
x = ( ( x * 341 ) > > 10 ) ;
2005-12-21 19:32:36 -08:00
return x ;
2005-12-13 23:13:28 -08:00
}
/*
 * Compute congestion window to use.
 *
 * Recomputes ca->cnt, the ACK count that bictcp_cong_avoid() compares
 * with tp->snd_cwnd_cnt before letting snd_cwnd grow by one.
 *
 * ca:   per-connection CUBIC state, updated in place
 * cwnd: the caller's current snd_cwnd
 */
static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
{
	u64 offs;
	u32 delta, t, bic_target, min_cnt, max_cnt;

	ca->ack_cnt++;	/* count the number of ACKs */

	/* nothing to redo if cwnd is unchanged and we ran within HZ/32 jiffies */
	if (ca->last_cwnd == cwnd &&
	    (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32)
		return;

	ca->last_cwnd = cwnd;
	ca->last_time = tcp_time_stamp;

	if (ca->epoch_start == 0) {
		ca->epoch_start = tcp_time_stamp;	/* record the beginning of an epoch */
		ca->ack_cnt = 1;			/* start counting */
		ca->tcp_cwnd = cwnd;			/* sync with cubic */

		if (ca->last_max_cwnd <= cwnd) {
			ca->bic_K = 0;
			ca->bic_origin_point = cwnd;
		} else {
			/* Compute new K based on
			 * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
			 */
			ca->bic_K = cubic_root(cube_factor
					       * (ca->last_max_cwnd - cwnd));
			ca->bic_origin_point = ca->last_max_cwnd;
		}
	}

	/* cubic function - calc */
	/* calculate c * time^3 / rtt,
	 *  while considering overflow in calculation of time^3
	 * (so time^3 is done by using 64 bit)
	 * and without the support of division of 64bit numbers
	 * (so all divisions are done by using 32 bit)
	 *  also NOTE the unit of those variables
	 *	  time  = (t - K) / 2^bictcp_HZ
	 *	  c = bic_scale >> 10
	 * rtt  = (srtt >> 3) / HZ
	 * !!! The following code does not have overflow problems,
	 * if the cwnd < 1 million packets !!!
	 */

	/* change the unit from HZ to bictcp_HZ */
	t = ((tcp_time_stamp + (ca->delay_min >> 3) - ca->epoch_start)
	     << BICTCP_HZ) / HZ;

	if (t < ca->bic_K)		/* t - K */
		offs = ca->bic_K - t;
	else
		offs = t - ca->bic_K;

	/* c/rtt * (t-K)^3 */
	delta = (cube_rtt_scale * offs * offs * offs) >> (10 + 3 * BICTCP_HZ);
	if (t < ca->bic_K)				/* below origin */
		bic_target = ca->bic_origin_point - delta;
	else						/* above origin */
		bic_target = ca->bic_origin_point + delta;

	/* cubic function - calc bictcp_cnt */
	if (bic_target > cwnd) {
		ca->cnt = cwnd / (bic_target - cwnd);
	} else {
		ca->cnt = 100 * cwnd;		/* very small increment */
	}

	if (ca->delay_min > 0) {
		/*
		 * max increment = Smax * rtt / 0.1
		 *
		 * max_increment can be changed at runtime (mode 0644).  A
		 * value of 0, or one that makes the product wrap to 0, must
		 * not be used as a divisor; the cap is skipped in that case.
		 */
		u32 divisor = 10 * max_increment * ca->delay_min;

		if (divisor != 0) {
			min_cnt = (cwnd * HZ * 8) / divisor;

			/* use concave growth when the target is above the origin */
			if (ca->cnt < min_cnt && t >= ca->bic_K)
				ca->cnt = min_cnt;
		}
	}

	/* slow start and low utilization */
	if (ca->loss_cwnd == 0)		/* could be aggressive in slow start */
		ca->cnt = 50;

	/* TCP Friendly */
	if (tcp_friendliness) {
		u32 scale = beta_scale;

		delta = (cwnd * scale) >> 3;
		/*
		 * ack_cnt is at least 1 here, so a zero step would never
		 * drain it and the loop below would not terminate.
		 */
		if (delta == 0)
			delta = 1;
		while (ca->ack_cnt > delta) {		/* update tcp cwnd */
			ca->ack_cnt -= delta;
			ca->tcp_cwnd++;
		}

		if (ca->tcp_cwnd > cwnd) {	/* if bic is slower than tcp */
			delta = ca->tcp_cwnd - cwnd;
			max_cnt = cwnd / delta;
			if (ca->cnt > max_cnt)
				ca->cnt = max_cnt;
		}
	}

	/* compensate for delayed ACKs: delayed_ack is Packets/ACKs << ACK_RATIO_SHIFT */
	ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack;
	if (ca->cnt == 0)			/* cannot be zero */
		ca->cnt = 1;
}
/*
 * Congestion avoidance hook.  Does nothing unless the connection is
 * cwnd-limited; otherwise either runs slow start (cwnd <= ssthresh) or
 * grows snd_cwnd by one each time snd_cwnd_cnt reaches the count
 * computed by bictcp_update().
 */
static void bictcp_cong_avoid(struct sock *sk, u32 ack,
			      u32 in_flight, int data_acked)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct bictcp *ca = inet_csk_ca(sk);

	if (!tcp_is_cwnd_limited(sk, in_flight))
		return;

	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		tcp_slow_start(tp);
		return;
	}

	bictcp_update(ca, tp->snd_cwnd);

	/* In dangerous area, increase slowly.
	 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
	 */
	if (tp->snd_cwnd_cnt < ca->cnt) {
		tp->snd_cwnd_cnt++;
		return;
	}

	/* enough ACKs counted: open the window by one, up to the clamp */
	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
	tp->snd_cwnd_cnt = 0;
}
static u32 bictcp_recalc_ssthresh ( struct sock * sk )
{
const struct tcp_sock * tp = tcp_sk ( sk ) ;
struct bictcp * ca = inet_csk_ca ( sk ) ;
ca - > epoch_start = 0 ; /* end of epoch */
/* Wmax and fast convergence */
if ( tp - > snd_cwnd < ca - > last_max_cwnd & & fast_convergence )
ca - > last_max_cwnd = ( tp - > snd_cwnd * ( BICTCP_BETA_SCALE + beta ) )
/ ( 2 * BICTCP_BETA_SCALE ) ;
else
ca - > last_max_cwnd = tp - > snd_cwnd ;
ca - > loss_cwnd = tp - > snd_cwnd ;
return max ( ( tp - > snd_cwnd * beta ) / BICTCP_BETA_SCALE , 2U ) ;
}
static u32 bictcp_undo_cwnd ( struct sock * sk )
{
struct bictcp * ca = inet_csk_ca ( sk ) ;
return max ( tcp_sk ( sk ) - > snd_cwnd , ca - > last_max_cwnd ) ;
}
/*
 * State-change hook: entering TCP_CA_Loss resets the CUBIC state of the
 * socket; every other transition is ignored.
 */
static void bictcp_state(struct sock *sk, u8 new_state)
{
	if (new_state != TCP_CA_Loss)
		return;

	bictcp_reset(inet_csk_ca(sk));
}
/* Track delayed acknowledgment ratio using sliding window
* ratio = ( 15 * ratio + sample ) / 16
*/
2007-07-25 23:49:34 -07:00
static void bictcp_acked ( struct sock * sk , u32 cnt , s32 rtt_us )
2005-12-13 23:13:28 -08:00
{
const struct inet_connection_sock * icsk = inet_csk ( sk ) ;
2007-07-25 23:50:06 -07:00
struct bictcp * ca = inet_csk_ca ( sk ) ;
u32 delay ;
2005-12-13 23:13:28 -08:00
2007-05-31 10:16:47 +03:00
if ( icsk - > icsk_ca_state = = TCP_CA_Open ) {
2005-12-13 23:13:28 -08:00
cnt - = ca - > delayed_ack > > ACK_RATIO_SHIFT ;
ca - > delayed_ack + = cnt ;
}
2007-07-25 23:50:06 -07:00
/* Some calls are for duplicates without timetamps */
if ( rtt_us < 0 )
return ;
/* Discard delay samples right after fast recovery */
if ( ( s32 ) ( tcp_time_stamp - ca - > epoch_start ) < HZ )
return ;
delay = usecs_to_jiffies ( rtt_us ) < < 3 ;
if ( delay = = 0 )
delay = 1 ;
/* first time call or link delay decreases */
if ( ca - > delay_min = = 0 | | ca - > delay_min > delay )
ca - > delay_min = delay ;
}
2005-12-13 23:13:28 -08:00
/*
 * Congestion control operations registered by cubictcp_register().
 *
 * The name must be exactly "cubic" with no padding inside the quotes;
 * NOTE(review): the TCP stack is expected to match this string verbatim
 * when an algorithm is selected by name - confirm against tcp_cong.c.
 */
static struct tcp_congestion_ops cubictcp = {
	.init		= bictcp_init,
	.ssthresh	= bictcp_recalc_ssthresh,
	.cong_avoid	= bictcp_cong_avoid,
	.set_state	= bictcp_state,
	.undo_cwnd	= bictcp_undo_cwnd,
	.pkts_acked	= bictcp_acked,
	.owner		= THIS_MODULE,
	.name		= "cubic",
};
/*
 * Module init: precompute the scale factors derived from the beta and
 * bic_scale parameters, then register the congestion control ops.
 * Returns the result of tcp_register_congestion_control().
 *
 * NOTE(review): beta == BICTCP_BETA_SCALE or bic_scale == 0 would make
 * one of the divisions below divide by zero; the parameters are not
 * validated here.
 */
static int __init cubictcp_register(void)
{
	/* bic_scale times ten; shared by the two factors below */
	const int scale_x10 = bic_scale * 10;

	BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);

	/* Precompute a bunch of the scaling factors that are used per-packet
	 * based on SRTT of 100ms
	 */

	beta_scale = 8 * (BICTCP_BETA_SCALE + beta) / 3
		/ (BICTCP_BETA_SCALE - beta);

	cube_rtt_scale = scale_x10;	/* 1024*c/rtt */

	/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
	 *  so K = cubic_root( (wmax-cwnd)*rtt/c )
	 * the unit of K is bictcp_HZ=2^10, not HZ
	 *
	 *  c = bic_scale >> 10
	 *  rtt = 100ms
	 *
	 * the following code has been designed and tested for
	 * cwnd < 1 million packets
	 * RTT < 100 seconds
	 * HZ < 1,000,00  (corresponding to 10 nano-second)
	 */

	/* 1/c * 2^2*bictcp_HZ * srtt */
	cube_factor = 1ull << (10 + 3 * BICTCP_HZ);	/* 2^40 */

	/* divide by bic_scale and by constant Srtt (100ms) */
	do_div(cube_factor, scale_x10);

	return tcp_register_congestion_control(&cubictcp);
}
/* Module exit: unregister the congestion control ops registered at init. */
static void __exit cubictcp_unregister(void)
{
	tcp_unregister_congestion_control(&cubictcp);
}
module_init(cubictcp_register);
module_exit(cubictcp_unregister);

MODULE_AUTHOR(" Sangtae Ha, Stephen Hemminger ");
/*
 * The licence identifier must be spelled exactly "GPL": a padded string is
 * not one of the recognised licence names.
 */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION(" CUBIC TCP ");
MODULE_VERSION(" 2.1 ");