2005-06-23 23:19:55 +04:00
/*
* Pluggable TCP congestion control support and newReno
* congestion control.
* Based on ideas from I/O scheduler support and Web100.
*
* Copyright ( C ) 2005 Stephen Hemminger < shemminger @ osdl . org >
*/
# include <linux/module.h>
# include <linux/mm.h>
# include <linux/slab.h>
# include <linux/types.h>
# include <linux/list.h>
# include <net/tcp.h>
2007-03-26 06:21:45 +04:00
/*
 * List of registered congestion control algorithms.  The entry at the head
 * of the list is the one reported and used as the default.
 */
static LIST_HEAD(tcp_cong_list);

/* Taken by every path in this file that adds, removes or moves list entries. */
static DEFINE_SPINLOCK(tcp_cong_list_lock);

/*
 * Limited slow start threshold consulted by tcp_slow_start(); the limit
 * only applies while this value is greater than zero.
 */
int sysctl_tcp_max_ssthresh = 0;
/* Simple linear search, don't expect many entries! */
static struct tcp_congestion_ops * tcp_ca_find ( const char * name )
{
struct tcp_congestion_ops * e ;
2005-06-24 07:37:36 +04:00
list_for_each_entry_rcu ( e , & tcp_cong_list , list ) {
2005-06-23 23:19:55 +04:00
if ( strcmp ( e - > name , name ) = = 0 )
return e ;
}
return NULL ;
}
/*
2007-02-17 21:07:33 +03:00
* Attach new congestion control algorithm to the list
2005-06-23 23:19:55 +04:00
* of available options .
*/
int tcp_register_congestion_control ( struct tcp_congestion_ops * ca )
{
int ret = 0 ;
/* all algorithms must implement ssthresh and cong_avoid ops */
2006-06-06 04:30:08 +04:00
if ( ! ca - > ssthresh | | ! ca - > cong_avoid ) {
2005-06-23 23:19:55 +04:00
printk ( KERN_ERR " TCP %s does not implement required ops \n " ,
ca - > name ) ;
return - EINVAL ;
}
spin_lock ( & tcp_cong_list_lock ) ;
if ( tcp_ca_find ( ca - > name ) ) {
printk ( KERN_NOTICE " TCP %s already registered \n " , ca - > name ) ;
ret = - EEXIST ;
} else {
2006-09-25 07:11:58 +04:00
list_add_tail_rcu ( & ca - > list , & tcp_cong_list ) ;
2005-06-23 23:19:55 +04:00
printk ( KERN_INFO " TCP %s registered \n " , ca - > name ) ;
}
spin_unlock ( & tcp_cong_list_lock ) ;
return ret ;
}
EXPORT_SYMBOL_GPL ( tcp_register_congestion_control ) ;
/*
* Remove congestion control algorithm , called from
* the module ' s remove function . Module ref counts are used
* to ensure that this can ' t be done till all sockets using
* that method are closed .
*/
void tcp_unregister_congestion_control ( struct tcp_congestion_ops * ca )
{
spin_lock ( & tcp_cong_list_lock ) ;
list_del_rcu ( & ca - > list ) ;
spin_unlock ( & tcp_cong_list_lock ) ;
}
EXPORT_SYMBOL_GPL ( tcp_unregister_congestion_control ) ;
/* Assign choice of congestion control. */
2005-08-10 11:03:31 +04:00
void tcp_init_congestion_control ( struct sock * sk )
2005-06-23 23:19:55 +04:00
{
2005-08-10 11:03:31 +04:00
struct inet_connection_sock * icsk = inet_csk ( sk ) ;
2005-06-23 23:19:55 +04:00
struct tcp_congestion_ops * ca ;
2007-04-24 09:32:11 +04:00
/* if no choice made yet assign the current value set as default */
if ( icsk - > icsk_ca_ops = = & tcp_init_congestion_ops ) {
rcu_read_lock ( ) ;
list_for_each_entry_rcu ( ca , & tcp_cong_list , list ) {
if ( try_module_get ( ca - > owner ) ) {
icsk - > icsk_ca_ops = ca ;
break ;
}
2005-06-24 07:37:36 +04:00
2007-04-24 09:32:11 +04:00
/* fallback to next available */
2005-06-23 23:19:55 +04:00
}
2007-04-24 09:32:11 +04:00
rcu_read_unlock ( ) ;
2005-06-23 23:19:55 +04:00
}
2005-08-10 11:03:31 +04:00
if ( icsk - > icsk_ca_ops - > init )
icsk - > icsk_ca_ops - > init ( sk ) ;
2005-06-23 23:19:55 +04:00
}
/* Manage refcounts on socket close. */
2005-08-10 11:03:31 +04:00
void tcp_cleanup_congestion_control ( struct sock * sk )
2005-06-23 23:19:55 +04:00
{
2005-08-10 11:03:31 +04:00
struct inet_connection_sock * icsk = inet_csk ( sk ) ;
if ( icsk - > icsk_ca_ops - > release )
icsk - > icsk_ca_ops - > release ( sk ) ;
module_put ( icsk - > icsk_ca_ops - > owner ) ;
2005-06-23 23:19:55 +04:00
}
/* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control ( const char * name )
{
struct tcp_congestion_ops * ca ;
int ret = - ENOENT ;
spin_lock ( & tcp_cong_list_lock ) ;
ca = tcp_ca_find ( name ) ;
# ifdef CONFIG_KMOD
2006-11-10 03:36:36 +03:00
if ( ! ca & & capable ( CAP_SYS_MODULE ) ) {
2005-06-23 23:19:55 +04:00
spin_unlock ( & tcp_cong_list_lock ) ;
request_module ( " tcp_%s " , name ) ;
spin_lock ( & tcp_cong_list_lock ) ;
ca = tcp_ca_find ( name ) ;
}
# endif
if ( ca ) {
2007-04-24 09:26:16 +04:00
ca - > flags | = TCP_CONG_NON_RESTRICTED ; /* default is always allowed */
2005-06-23 23:19:55 +04:00
list_move ( & ca - > list , & tcp_cong_list ) ;
ret = 0 ;
}
spin_unlock ( & tcp_cong_list_lock ) ;
return ret ;
}
2006-11-01 04:31:33 +03:00
/*
 * Boot-time hook: install the congestion control named by the kernel
 * configuration (CONFIG_DEFAULT_TCP_CONG) as the default.  Returns the
 * result of tcp_set_default_congestion_control().
 */
static int __init tcp_congestion_default(void)
{
	const char *configured = CONFIG_DEFAULT_TCP_CONG;

	return tcp_set_default_congestion_control(configured);
}
late_initcall(tcp_congestion_default);
2006-11-10 03:32:06 +03:00
/*
 * Build a string listing every registered congestion control algorithm.
 *
 * @buf:    destination buffer
 * @maxlen: size of @buf in bytes
 *
 * Names are joined by single spaces with no leading or trailing blank,
 * which is the format tcp_set_allowed_congestion_control() parses (its
 * tokenizer stops at the first empty token).  The result is truncated if
 * it does not fit, and @buf is left as an empty string when no algorithm
 * is registered.
 */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	if (maxlen == 0)
		return;

	/* make sure the caller gets a valid string even for an empty list */
	*buf = '\0';

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		/*
		 * snprintf() reports the length it wanted to write, so offs
		 * can reach or pass maxlen on truncation.  Stop here: another
		 * iteration would compute a wrapped-around "maxlen - offs"
		 * and write beyond the end of buf.
		 */
		if (offs >= maxlen)
			break;
	}
	rcu_read_unlock();
}
2005-06-23 23:19:55 +04:00
/* Get current default congestion control */
void tcp_get_default_congestion_control ( char * name )
{
struct tcp_congestion_ops * ca ;
/* We will always have reno... */
BUG_ON ( list_empty ( & tcp_cong_list ) ) ;
rcu_read_lock ( ) ;
ca = list_entry ( tcp_cong_list . next , struct tcp_congestion_ops , list ) ;
strncpy ( name , ca - > name , TCP_CA_NAME_MAX ) ;
rcu_read_unlock ( ) ;
}
2006-11-10 03:35:15 +03:00
/*
 * Build a string listing the non-restricted congestion control algorithms.
 *
 * @buf:    destination buffer
 * @maxlen: size of @buf in bytes
 *
 * Only entries flagged TCP_CONG_NON_RESTRICTED are listed.  Names are
 * joined by single spaces with no leading or trailing blank, which is the
 * format tcp_set_allowed_congestion_control() parses (its tokenizer stops
 * at the first empty token).  The result is truncated if it does not fit.
 */
void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	if (maxlen == 0)
		return;

	/* start from an empty string in case nothing is allowed */
	*buf = '\0';

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
			continue;

		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);

		/*
		 * snprintf() reports the length it wanted to write, so offs
		 * can reach or pass maxlen on truncation.  Stop here: another
		 * iteration would compute a wrapped-around "maxlen - offs"
		 * and write beyond the end of buf.
		 */
		if (offs >= maxlen)
			break;
	}
	rcu_read_unlock();
}
/* Change list of non-restricted congestion control */
int tcp_set_allowed_congestion_control ( char * val )
{
struct tcp_congestion_ops * ca ;
char * clone , * name ;
int ret = 0 ;
clone = kstrdup ( val , GFP_USER ) ;
if ( ! clone )
return - ENOMEM ;
spin_lock ( & tcp_cong_list_lock ) ;
/* pass 1 check for bad entries */
while ( ( name = strsep ( & clone , " " ) ) & & * name ) {
ca = tcp_ca_find ( name ) ;
if ( ! ca ) {
ret = - ENOENT ;
goto out ;
}
}
2007-04-24 09:26:16 +04:00
/* pass 2 clear old values */
2006-11-10 03:35:15 +03:00
list_for_each_entry_rcu ( ca , & tcp_cong_list , list )
2007-04-24 09:26:16 +04:00
ca - > flags & = ~ TCP_CONG_NON_RESTRICTED ;
2006-11-10 03:35:15 +03:00
/* pass 3 mark as allowed */
while ( ( name = strsep ( & val , " " ) ) & & * name ) {
ca = tcp_ca_find ( name ) ;
WARN_ON ( ! ca ) ;
if ( ca )
2007-04-24 09:26:16 +04:00
ca - > flags | = TCP_CONG_NON_RESTRICTED ;
2006-11-10 03:35:15 +03:00
}
out :
spin_unlock ( & tcp_cong_list_lock ) ;
return ret ;
}
2005-06-24 07:37:36 +04:00
/* Change congestion control for socket */
2005-08-10 11:03:31 +04:00
int tcp_set_congestion_control ( struct sock * sk , const char * name )
2005-06-24 07:37:36 +04:00
{
2005-08-10 11:03:31 +04:00
struct inet_connection_sock * icsk = inet_csk ( sk ) ;
2005-06-24 07:37:36 +04:00
struct tcp_congestion_ops * ca ;
int err = 0 ;
rcu_read_lock ( ) ;
ca = tcp_ca_find ( name ) ;
2007-04-24 09:32:11 +04:00
2006-11-10 03:36:36 +03:00
/* no change asking for existing value */
2005-08-10 11:03:31 +04:00
if ( ca = = icsk - > icsk_ca_ops )
2005-06-24 07:37:36 +04:00
goto out ;
2006-11-10 03:36:36 +03:00
# ifdef CONFIG_KMOD
/* not found attempt to autoload module */
if ( ! ca & & capable ( CAP_SYS_MODULE ) ) {
rcu_read_unlock ( ) ;
request_module ( " tcp_%s " , name ) ;
rcu_read_lock ( ) ;
ca = tcp_ca_find ( name ) ;
}
# endif
2005-06-24 07:37:36 +04:00
if ( ! ca )
err = - ENOENT ;
2007-04-24 09:26:16 +04:00
else if ( ! ( ( ca - > flags & TCP_CONG_NON_RESTRICTED ) | | capable ( CAP_NET_ADMIN ) ) )
2006-11-10 03:35:15 +03:00
err = - EPERM ;
2005-06-24 07:37:36 +04:00
else if ( ! try_module_get ( ca - > owner ) )
err = - EBUSY ;
else {
2005-08-10 11:03:31 +04:00
tcp_cleanup_congestion_control ( sk ) ;
icsk - > icsk_ca_ops = ca ;
2007-04-24 09:32:11 +04:00
if ( sk - > sk_state ! = TCP_CLOSE & & icsk - > icsk_ca_ops - > init )
2005-08-10 11:03:31 +04:00
icsk - > icsk_ca_ops - > init ( sk ) ;
2005-06-24 07:37:36 +04:00
}
out :
rcu_read_unlock ( ) ;
return err ;
}
2006-01-04 03:03:49 +03:00
/*
2007-03-26 06:21:45 +04:00
* Slow start ( exponential increase ) with
* RFC3742 Limited Slow Start ( fast linear increase ) support .
2006-01-04 03:03:49 +03:00
*/
void tcp_slow_start ( struct tcp_sock * tp )
{
2007-03-26 06:21:45 +04:00
int cnt = 0 ;
2006-01-04 03:03:49 +03:00
if ( sysctl_tcp_abc ) {
/* RFC3465: Slow Start
* TCP sender SHOULD increase cwnd by the number of
* previously unacknowledged bytes ACKed by each incoming
* acknowledgment , provided the increase is not more than L
*/
if ( tp - > bytes_acked < tp - > mss_cache )
return ;
}
2007-03-26 06:21:45 +04:00
if ( sysctl_tcp_max_ssthresh > 0 & &
tp - > snd_cwnd > sysctl_tcp_max_ssthresh )
cnt + = sysctl_tcp_max_ssthresh > > 1 ;
else
cnt + = tp - > snd_cwnd ;
/* RFC3465: We MAY increase by 2 if discovered delayed ack */
if ( sysctl_tcp_abc > 1 & & tp - > bytes_acked > = 2 * tp - > mss_cache )
cnt < < = 1 ;
2006-01-04 03:03:49 +03:00
tp - > bytes_acked = 0 ;
2007-03-26 06:21:45 +04:00
tp - > snd_cwnd_cnt + = cnt ;
while ( tp - > snd_cwnd_cnt > = tp - > snd_cwnd ) {
tp - > snd_cwnd_cnt - = tp - > snd_cwnd ;
if ( tp - > snd_cwnd < tp - > snd_cwnd_clamp )
tp - > snd_cwnd + + ;
}
2006-01-04 03:03:49 +03:00
}
EXPORT_SYMBOL_GPL ( tcp_slow_start ) ;
2005-06-23 23:19:55 +04:00
/*
* TCP Reno congestion control.
* This is a special case that is also used as the fallback.
*/
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM ' 88 , p . 328.
*/
2005-08-10 11:03:31 +04:00
void tcp_reno_cong_avoid ( struct sock * sk , u32 ack , u32 rtt , u32 in_flight ,
2005-06-23 23:19:55 +04:00
int flag )
{
2005-08-10 11:03:31 +04:00
struct tcp_sock * tp = tcp_sk ( sk ) ;
2005-11-11 03:53:30 +03:00
if ( ! tcp_is_cwnd_limited ( sk , in_flight ) )
2005-06-23 23:19:55 +04:00
return ;
2005-11-11 04:07:24 +03:00
/* In "safe" area, increase. */
2007-02-09 17:24:47 +03:00
if ( tp - > snd_cwnd < = tp - > snd_ssthresh )
2005-11-11 04:07:24 +03:00
tcp_slow_start ( tp ) ;
2005-11-11 04:09:53 +03:00
2007-02-09 17:24:47 +03:00
/* In dangerous area, increase slowly. */
2005-11-11 04:09:53 +03:00
else if ( sysctl_tcp_abc ) {
2007-02-09 17:24:47 +03:00
/* RFC3465: Appropriate Byte Count
* increase once for each full cwnd acked
*/
if ( tp - > bytes_acked > = tp - > snd_cwnd * tp - > mss_cache ) {
tp - > bytes_acked - = tp - > snd_cwnd * tp - > mss_cache ;
if ( tp - > snd_cwnd < tp - > snd_cwnd_clamp )
tp - > snd_cwnd + + ;
}
} else {
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */
if ( tp - > snd_cwnd_cnt > = tp - > snd_cwnd ) {
if ( tp - > snd_cwnd < tp - > snd_cwnd_clamp )
tp - > snd_cwnd + + ;
tp - > snd_cwnd_cnt = 0 ;
} else
tp - > snd_cwnd_cnt + + ;
}
2005-06-23 23:19:55 +04:00
}
EXPORT_SYMBOL_GPL ( tcp_reno_cong_avoid ) ;
/* Slow start threshold is half the congestion window (min 2) */
2005-08-10 11:03:31 +04:00
u32 tcp_reno_ssthresh ( struct sock * sk )
2005-06-23 23:19:55 +04:00
{
2005-08-10 11:03:31 +04:00
const struct tcp_sock * tp = tcp_sk ( sk ) ;
2005-06-23 23:19:55 +04:00
return max ( tp - > snd_cwnd > > 1U , 2U ) ;
}
EXPORT_SYMBOL_GPL ( tcp_reno_ssthresh ) ;
2006-06-06 04:30:08 +04:00
/* Lower bound on congestion window with halving. */
u32 tcp_reno_min_cwnd ( const struct sock * sk )
2005-06-23 23:19:55 +04:00
{
2005-08-10 11:03:31 +04:00
const struct tcp_sock * tp = tcp_sk ( sk ) ;
2005-06-23 23:19:55 +04:00
return tp - > snd_ssthresh / 2 ;
}
EXPORT_SYMBOL_GPL ( tcp_reno_min_cwnd ) ;
struct tcp_congestion_ops tcp_reno = {
2007-04-24 09:26:16 +04:00
. flags = TCP_CONG_NON_RESTRICTED ,
2005-06-23 23:19:55 +04:00
. name = " reno " ,
. owner = THIS_MODULE ,
. ssthresh = tcp_reno_ssthresh ,
. cong_avoid = tcp_reno_cong_avoid ,
. min_cwnd = tcp_reno_min_cwnd ,
} ;
2005-06-24 07:37:36 +04:00
/*
 * Initial congestion control used (until SYN).
 *
 * It behaves exactly like reno but is a distinct object, so that
 * tcp_init_congestion_control() can recognize a socket that has not
 * been given an algorithm yet by comparing against its address.
 */
struct tcp_congestion_ops tcp_init_congestion_ops = {
	.name		= " ",
	.owner		= THIS_MODULE,
	.cong_avoid	= tcp_reno_cong_avoid,
	.ssthresh	= tcp_reno_ssthresh,
	.min_cwnd	= tcp_reno_min_cwnd,
};
EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);