6ba8a3b19e
This patch series implement the Tail loss probe (TLP) algorithm described in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The first patch implements the basic algorithm. TLP's goal is to reduce tail latency of short transactions. It achieves this by converting retransmission timeouts (RTOs) occuring due to tail losses (losses at end of transactions) into fast recovery. TLP transmits one packet in two round-trips when a connection is in Open state and isn't receiving any ACKs. The transmitted packet, aka loss probe, can be either new or a retransmission. When there is tail loss, the ACK from a loss probe triggers FACK/early-retransmit based fast recovery, thus avoiding a costly RTO. In the absence of loss, there is no change in the connection state. PTO stands for probe timeout. It is a timer event indicating that an ACK is overdue and triggers a loss probe packet. The PTO value is set to max(2*SRTT, 10ms) and is adjusted to account for delayed ACK timer when there is only one oustanding packet. TLP Algorithm On transmission of new data in Open state: -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms). -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms) -> PTO = min(PTO, RTO) Conditions for scheduling PTO: -> Connection is in Open state. -> Connection is either cwnd limited or no new data to send. -> Number of probes per tail loss episode is limited to one. -> Connection is SACK enabled. When PTO fires: new_segment_exists: -> transmit new segment. -> packets_out++. cwnd remains same. no_new_packet: -> retransmit the last segment. Its ACK triggers FACK or early retransmit based recovery. ACK path: -> rearm RTO at start of ACK processing. -> reschedule PTO if need be. In addition, the patch includes a small variation to the Early Retransmit (ER) algorithm, such that ER and TLP together can in principle recover any N-degree of tail loss through fast recovery. TLP is controlled by the same sysctl as ER, tcp_early_retrans sysctl. tcp_early_retrans==0; disables TLP and ER. ==1; enables RFC5827 ER. ==2; delayed ER. ==3; TLP and delayed ER. [DEFAULT] ==4; TLP only. The TLP patch series have been extensively tested on Google Web servers. It is most effective for short Web trasactions, where it reduced RTOs by 15% and improved HTTP response time (average by 6%, 99th percentile by 10%). The transmitted probes account for <0.5% of the overall transmissions. Signed-off-by: Nandita Dukkipati <nanditad@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
952 lines
22 KiB
C
952 lines
22 KiB
C
/*
|
|
* sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
|
|
*
|
|
* Begun April 1, 1996, Mike Shaver.
|
|
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
|
|
*/
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/module.h>
|
|
#include <linux/sysctl.h>
|
|
#include <linux/igmp.h>
|
|
#include <linux/inetdevice.h>
|
|
#include <linux/seqlock.h>
|
|
#include <linux/init.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/swap.h>
|
|
#include <net/snmp.h>
|
|
#include <net/icmp.h>
|
|
#include <net/ip.h>
|
|
#include <net/route.h>
|
|
#include <net/tcp.h>
|
|
#include <net/udp.h>
|
|
#include <net/cipso_ipv4.h>
|
|
#include <net/inet_frag.h>
|
|
#include <net/ping.h>
|
|
#include <net/tcp_memcontrol.h>
|
|
|
|
static int zero;
|
|
static int one = 1;
|
|
static int four = 4;
|
|
static int tcp_retr1_max = 255;
|
|
static int ip_local_port_range_min[] = { 1, 1 };
|
|
static int ip_local_port_range_max[] = { 65535, 65535 };
|
|
static int tcp_adv_win_scale_min = -31;
|
|
static int tcp_adv_win_scale_max = 31;
|
|
static int ip_ttl_min = 1;
|
|
static int ip_ttl_max = 255;
|
|
static int ip_ping_group_range_min[] = { 0, 0 };
|
|
static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
|
|
|
|
/* Update system visible IP port range */
|
|
static void set_local_port_range(int range[2])
|
|
{
|
|
write_seqlock(&sysctl_local_ports.lock);
|
|
sysctl_local_ports.range[0] = range[0];
|
|
sysctl_local_ports.range[1] = range[1];
|
|
write_sequnlock(&sysctl_local_ports.lock);
|
|
}
|
|
|
|
/* Validate changes from /proc interface. */
|
|
static int ipv4_local_port_range(ctl_table *table, int write,
|
|
void __user *buffer,
|
|
size_t *lenp, loff_t *ppos)
|
|
{
|
|
int ret;
|
|
int range[2];
|
|
ctl_table tmp = {
|
|
.data = &range,
|
|
.maxlen = sizeof(range),
|
|
.mode = table->mode,
|
|
.extra1 = &ip_local_port_range_min,
|
|
.extra2 = &ip_local_port_range_max,
|
|
};
|
|
|
|
inet_get_local_port_range(range, range + 1);
|
|
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
|
|
|
|
if (write && ret == 0) {
|
|
if (range[1] < range[0])
|
|
ret = -EINVAL;
|
|
else
|
|
set_local_port_range(range);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
|
|
{
|
|
kgid_t *data = table->data;
|
|
unsigned int seq;
|
|
do {
|
|
seq = read_seqbegin(&sysctl_local_ports.lock);
|
|
|
|
*low = data[0];
|
|
*high = data[1];
|
|
} while (read_seqretry(&sysctl_local_ports.lock, seq));
|
|
}
|
|
|
|
/* Update system visible IP port range */
|
|
static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
|
|
{
|
|
kgid_t *data = table->data;
|
|
write_seqlock(&sysctl_local_ports.lock);
|
|
data[0] = low;
|
|
data[1] = high;
|
|
write_sequnlock(&sysctl_local_ports.lock);
|
|
}
|
|
|
|
/* Validate changes from /proc interface. */
|
|
static int ipv4_ping_group_range(ctl_table *table, int write,
|
|
void __user *buffer,
|
|
size_t *lenp, loff_t *ppos)
|
|
{
|
|
struct user_namespace *user_ns = current_user_ns();
|
|
int ret;
|
|
gid_t urange[2];
|
|
kgid_t low, high;
|
|
ctl_table tmp = {
|
|
.data = &urange,
|
|
.maxlen = sizeof(urange),
|
|
.mode = table->mode,
|
|
.extra1 = &ip_ping_group_range_min,
|
|
.extra2 = &ip_ping_group_range_max,
|
|
};
|
|
|
|
inet_get_ping_group_range_table(table, &low, &high);
|
|
urange[0] = from_kgid_munged(user_ns, low);
|
|
urange[1] = from_kgid_munged(user_ns, high);
|
|
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
|
|
|
|
if (write && ret == 0) {
|
|
low = make_kgid(user_ns, urange[0]);
|
|
high = make_kgid(user_ns, urange[1]);
|
|
if (!gid_valid(low) || !gid_valid(high) ||
|
|
(urange[1] < urange[0]) || gid_lt(high, low)) {
|
|
low = make_kgid(&init_user_ns, 1);
|
|
high = make_kgid(&init_user_ns, 0);
|
|
}
|
|
set_ping_group_range(table, low, high);
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int proc_tcp_congestion_control(ctl_table *ctl, int write,
|
|
void __user *buffer, size_t *lenp, loff_t *ppos)
|
|
{
|
|
char val[TCP_CA_NAME_MAX];
|
|
ctl_table tbl = {
|
|
.data = val,
|
|
.maxlen = TCP_CA_NAME_MAX,
|
|
};
|
|
int ret;
|
|
|
|
tcp_get_default_congestion_control(val);
|
|
|
|
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
if (write && ret == 0)
|
|
ret = tcp_set_default_congestion_control(val);
|
|
return ret;
|
|
}
|
|
|
|
static int proc_tcp_available_congestion_control(ctl_table *ctl,
|
|
int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
|
|
int ret;
|
|
|
|
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
|
|
if (!tbl.data)
|
|
return -ENOMEM;
|
|
tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX);
|
|
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
kfree(tbl.data);
|
|
return ret;
|
|
}
|
|
|
|
static int proc_allowed_congestion_control(ctl_table *ctl,
|
|
int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
|
|
int ret;
|
|
|
|
tbl.data = kmalloc(tbl.maxlen, GFP_USER);
|
|
if (!tbl.data)
|
|
return -ENOMEM;
|
|
|
|
tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen);
|
|
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
if (write && ret == 0)
|
|
ret = tcp_set_allowed_congestion_control(tbl.data);
|
|
kfree(tbl.data);
|
|
return ret;
|
|
}
|
|
|
|
static int ipv4_tcp_mem(ctl_table *ctl, int write,
|
|
void __user *buffer, size_t *lenp,
|
|
loff_t *ppos)
|
|
{
|
|
int ret;
|
|
unsigned long vec[3];
|
|
struct net *net = current->nsproxy->net_ns;
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
struct mem_cgroup *memcg;
|
|
#endif
|
|
|
|
ctl_table tmp = {
|
|
.data = &vec,
|
|
.maxlen = sizeof(vec),
|
|
.mode = ctl->mode,
|
|
};
|
|
|
|
if (!write) {
|
|
ctl->data = &net->ipv4.sysctl_tcp_mem;
|
|
return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
|
|
}
|
|
|
|
ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
|
|
if (ret)
|
|
return ret;
|
|
|
|
#ifdef CONFIG_MEMCG_KMEM
|
|
rcu_read_lock();
|
|
memcg = mem_cgroup_from_task(current);
|
|
|
|
tcp_prot_mem(memcg, vec[0], 0);
|
|
tcp_prot_mem(memcg, vec[1], 1);
|
|
tcp_prot_mem(memcg, vec[2], 2);
|
|
rcu_read_unlock();
|
|
#endif
|
|
|
|
net->ipv4.sysctl_tcp_mem[0] = vec[0];
|
|
net->ipv4.sysctl_tcp_mem[1] = vec[1];
|
|
net->ipv4.sysctl_tcp_mem[2] = vec[2];
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer,
|
|
size_t *lenp, loff_t *ppos)
|
|
{
|
|
ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
|
|
struct tcp_fastopen_context *ctxt;
|
|
int ret;
|
|
u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
|
|
|
|
tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
|
|
if (!tbl.data)
|
|
return -ENOMEM;
|
|
|
|
rcu_read_lock();
|
|
ctxt = rcu_dereference(tcp_fastopen_ctx);
|
|
if (ctxt)
|
|
memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
|
|
else
|
|
memset(user_key, 0, sizeof(user_key));
|
|
rcu_read_unlock();
|
|
|
|
snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
|
|
user_key[0], user_key[1], user_key[2], user_key[3]);
|
|
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
|
|
|
|
if (write && ret == 0) {
|
|
if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
|
|
user_key + 2, user_key + 3) != 4) {
|
|
ret = -EINVAL;
|
|
goto bad_key;
|
|
}
|
|
tcp_fastopen_reset_cipher(user_key, TCP_FASTOPEN_KEY_LENGTH);
|
|
}
|
|
|
|
bad_key:
|
|
pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
|
|
user_key[0], user_key[1], user_key[2], user_key[3],
|
|
(char *)tbl.data, ret);
|
|
kfree(tbl.data);
|
|
return ret;
|
|
}
|
|
|
|
static struct ctl_table ipv4_table[] = {
|
|
{
|
|
.procname = "tcp_timestamps",
|
|
.data = &sysctl_tcp_timestamps,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_window_scaling",
|
|
.data = &sysctl_tcp_window_scaling,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_sack",
|
|
.data = &sysctl_tcp_sack,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_retrans_collapse",
|
|
.data = &sysctl_tcp_retrans_collapse,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ip_default_ttl",
|
|
.data = &sysctl_ip_default_ttl,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &ip_ttl_min,
|
|
.extra2 = &ip_ttl_max,
|
|
},
|
|
{
|
|
.procname = "ip_no_pmtu_disc",
|
|
.data = &ipv4_config.no_pmtu_disc,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ip_nonlocal_bind",
|
|
.data = &sysctl_ip_nonlocal_bind,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_syn_retries",
|
|
.data = &sysctl_tcp_syn_retries,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_synack_retries",
|
|
.data = &sysctl_tcp_synack_retries,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_max_orphans",
|
|
.data = &sysctl_tcp_max_orphans,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_max_tw_buckets",
|
|
.data = &tcp_death_row.sysctl_max_tw_buckets,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ip_early_demux",
|
|
.data = &sysctl_ip_early_demux,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ip_dynaddr",
|
|
.data = &sysctl_ip_dynaddr,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_keepalive_time",
|
|
.data = &sysctl_tcp_keepalive_time,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
{
|
|
.procname = "tcp_keepalive_probes",
|
|
.data = &sysctl_tcp_keepalive_probes,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_keepalive_intvl",
|
|
.data = &sysctl_tcp_keepalive_intvl,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
{
|
|
.procname = "tcp_retries1",
|
|
.data = &sysctl_tcp_retries1,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra2 = &tcp_retr1_max
|
|
},
|
|
{
|
|
.procname = "tcp_retries2",
|
|
.data = &sysctl_tcp_retries2,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_fin_timeout",
|
|
.data = &sysctl_tcp_fin_timeout,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
#ifdef CONFIG_SYN_COOKIES
|
|
{
|
|
.procname = "tcp_syncookies",
|
|
.data = &sysctl_tcp_syncookies,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
#endif
|
|
{
|
|
.procname = "tcp_fastopen",
|
|
.data = &sysctl_tcp_fastopen,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_fastopen_key",
|
|
.mode = 0600,
|
|
.maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
|
|
.proc_handler = proc_tcp_fastopen_key,
|
|
},
|
|
{
|
|
.procname = "tcp_tw_recycle",
|
|
.data = &tcp_death_row.sysctl_tw_recycle,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_abort_on_overflow",
|
|
.data = &sysctl_tcp_abort_on_overflow,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_stdurg",
|
|
.data = &sysctl_tcp_stdurg,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_rfc1337",
|
|
.data = &sysctl_tcp_rfc1337,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_max_syn_backlog",
|
|
.data = &sysctl_max_syn_backlog,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ip_local_port_range",
|
|
.data = &sysctl_local_ports.range,
|
|
.maxlen = sizeof(sysctl_local_ports.range),
|
|
.mode = 0644,
|
|
.proc_handler = ipv4_local_port_range,
|
|
},
|
|
{
|
|
.procname = "ip_local_reserved_ports",
|
|
.data = NULL, /* initialized in sysctl_ipv4_init */
|
|
.maxlen = 65536,
|
|
.mode = 0644,
|
|
.proc_handler = proc_do_large_bitmap,
|
|
},
|
|
{
|
|
.procname = "igmp_max_memberships",
|
|
.data = &sysctl_igmp_max_memberships,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "igmp_max_msf",
|
|
.data = &sysctl_igmp_max_msf,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "inet_peer_threshold",
|
|
.data = &inet_peer_threshold,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "inet_peer_minttl",
|
|
.data = &inet_peer_minttl,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
{
|
|
.procname = "inet_peer_maxttl",
|
|
.data = &inet_peer_maxttl,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_jiffies,
|
|
},
|
|
{
|
|
.procname = "tcp_orphan_retries",
|
|
.data = &sysctl_tcp_orphan_retries,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_fack",
|
|
.data = &sysctl_tcp_fack,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_reordering",
|
|
.data = &sysctl_tcp_reordering,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_dsack",
|
|
.data = &sysctl_tcp_dsack,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_wmem",
|
|
.data = &sysctl_tcp_wmem,
|
|
.maxlen = sizeof(sysctl_tcp_wmem),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &one,
|
|
},
|
|
{
|
|
.procname = "tcp_rmem",
|
|
.data = &sysctl_tcp_rmem,
|
|
.maxlen = sizeof(sysctl_tcp_rmem),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &one,
|
|
},
|
|
{
|
|
.procname = "tcp_app_win",
|
|
.data = &sysctl_tcp_app_win,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_adv_win_scale",
|
|
.data = &sysctl_tcp_adv_win_scale,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &tcp_adv_win_scale_min,
|
|
.extra2 = &tcp_adv_win_scale_max,
|
|
},
|
|
{
|
|
.procname = "tcp_tw_reuse",
|
|
.data = &sysctl_tcp_tw_reuse,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_frto",
|
|
.data = &sysctl_tcp_frto,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_frto_response",
|
|
.data = &sysctl_tcp_frto_response,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_low_latency",
|
|
.data = &sysctl_tcp_low_latency,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_no_metrics_save",
|
|
.data = &sysctl_tcp_nometrics_save,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_moderate_rcvbuf",
|
|
.data = &sysctl_tcp_moderate_rcvbuf,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_tso_win_divisor",
|
|
.data = &sysctl_tcp_tso_win_divisor,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_congestion_control",
|
|
.mode = 0644,
|
|
.maxlen = TCP_CA_NAME_MAX,
|
|
.proc_handler = proc_tcp_congestion_control,
|
|
},
|
|
{
|
|
.procname = "tcp_mtu_probing",
|
|
.data = &sysctl_tcp_mtu_probing,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_base_mss",
|
|
.data = &sysctl_tcp_base_mss,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_workaround_signed_windows",
|
|
.data = &sysctl_tcp_workaround_signed_windows,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_limit_output_bytes",
|
|
.data = &sysctl_tcp_limit_output_bytes,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_challenge_ack_limit",
|
|
.data = &sysctl_tcp_challenge_ack_limit,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
#ifdef CONFIG_NET_DMA
|
|
{
|
|
.procname = "tcp_dma_copybreak",
|
|
.data = &sysctl_tcp_dma_copybreak,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
#endif
|
|
{
|
|
.procname = "tcp_slow_start_after_idle",
|
|
.data = &sysctl_tcp_slow_start_after_idle,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
#ifdef CONFIG_NETLABEL
|
|
{
|
|
.procname = "cipso_cache_enable",
|
|
.data = &cipso_v4_cache_enabled,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "cipso_cache_bucket_size",
|
|
.data = &cipso_v4_cache_bucketsize,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "cipso_rbm_optfmt",
|
|
.data = &cipso_v4_rbm_optfmt,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "cipso_rbm_strictvalid",
|
|
.data = &cipso_v4_rbm_strictvalid,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
#endif /* CONFIG_NETLABEL */
|
|
{
|
|
.procname = "tcp_available_congestion_control",
|
|
.maxlen = TCP_CA_BUF_MAX,
|
|
.mode = 0444,
|
|
.proc_handler = proc_tcp_available_congestion_control,
|
|
},
|
|
{
|
|
.procname = "tcp_allowed_congestion_control",
|
|
.maxlen = TCP_CA_BUF_MAX,
|
|
.mode = 0644,
|
|
.proc_handler = proc_allowed_congestion_control,
|
|
},
|
|
{
|
|
.procname = "tcp_max_ssthresh",
|
|
.data = &sysctl_tcp_max_ssthresh,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec,
|
|
},
|
|
{
|
|
.procname = "tcp_cookie_size",
|
|
.data = &sysctl_tcp_cookie_size,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_thin_linear_timeouts",
|
|
.data = &sysctl_tcp_thin_linear_timeouts,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_thin_dupack",
|
|
.data = &sysctl_tcp_thin_dupack,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_early_retrans",
|
|
.data = &sysctl_tcp_early_retrans,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &zero,
|
|
.extra2 = &four,
|
|
},
|
|
{
|
|
.procname = "udp_mem",
|
|
.data = &sysctl_udp_mem,
|
|
.maxlen = sizeof(sysctl_udp_mem),
|
|
.mode = 0644,
|
|
.proc_handler = proc_doulongvec_minmax,
|
|
},
|
|
{
|
|
.procname = "udp_rmem_min",
|
|
.data = &sysctl_udp_rmem_min,
|
|
.maxlen = sizeof(sysctl_udp_rmem_min),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &one
|
|
},
|
|
{
|
|
.procname = "udp_wmem_min",
|
|
.data = &sysctl_udp_wmem_min,
|
|
.maxlen = sizeof(sysctl_udp_wmem_min),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_minmax,
|
|
.extra1 = &one
|
|
},
|
|
{ }
|
|
};
|
|
|
|
static struct ctl_table ipv4_net_table[] = {
|
|
{
|
|
.procname = "icmp_echo_ignore_all",
|
|
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "icmp_echo_ignore_broadcasts",
|
|
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "icmp_ignore_bogus_error_responses",
|
|
.data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "icmp_errors_use_inbound_ifaddr",
|
|
.data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "icmp_ratelimit",
|
|
.data = &init_net.ipv4.sysctl_icmp_ratelimit,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec_ms_jiffies,
|
|
},
|
|
{
|
|
.procname = "icmp_ratemask",
|
|
.data = &init_net.ipv4.sysctl_icmp_ratemask,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "ping_group_range",
|
|
.data = &init_net.ipv4.sysctl_ping_group_range,
|
|
.maxlen = sizeof(gid_t)*2,
|
|
.mode = 0644,
|
|
.proc_handler = ipv4_ping_group_range,
|
|
},
|
|
{
|
|
.procname = "tcp_ecn",
|
|
.data = &init_net.ipv4.sysctl_tcp_ecn,
|
|
.maxlen = sizeof(int),
|
|
.mode = 0644,
|
|
.proc_handler = proc_dointvec
|
|
},
|
|
{
|
|
.procname = "tcp_mem",
|
|
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_mem),
|
|
.mode = 0644,
|
|
.proc_handler = ipv4_tcp_mem,
|
|
},
|
|
{ }
|
|
};
|
|
|
|
static __net_init int ipv4_sysctl_init_net(struct net *net)
|
|
{
|
|
struct ctl_table *table;
|
|
|
|
table = ipv4_net_table;
|
|
if (!net_eq(net, &init_net)) {
|
|
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
|
|
if (table == NULL)
|
|
goto err_alloc;
|
|
|
|
table[0].data =
|
|
&net->ipv4.sysctl_icmp_echo_ignore_all;
|
|
table[1].data =
|
|
&net->ipv4.sysctl_icmp_echo_ignore_broadcasts;
|
|
table[2].data =
|
|
&net->ipv4.sysctl_icmp_ignore_bogus_error_responses;
|
|
table[3].data =
|
|
&net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr;
|
|
table[4].data =
|
|
&net->ipv4.sysctl_icmp_ratelimit;
|
|
table[5].data =
|
|
&net->ipv4.sysctl_icmp_ratemask;
|
|
table[6].data =
|
|
&net->ipv4.sysctl_ping_group_range;
|
|
table[7].data =
|
|
&net->ipv4.sysctl_tcp_ecn;
|
|
|
|
/* Don't export sysctls to unprivileged users */
|
|
if (net->user_ns != &init_user_ns)
|
|
table[0].procname = NULL;
|
|
}
|
|
|
|
/*
|
|
* Sane defaults - nobody may create ping sockets.
|
|
* Boot scripts should set this to distro-specific group.
|
|
*/
|
|
net->ipv4.sysctl_ping_group_range[0] = make_kgid(&init_user_ns, 1);
|
|
net->ipv4.sysctl_ping_group_range[1] = make_kgid(&init_user_ns, 0);
|
|
|
|
tcp_init_mem(net);
|
|
|
|
net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
|
|
if (net->ipv4.ipv4_hdr == NULL)
|
|
goto err_reg;
|
|
|
|
return 0;
|
|
|
|
err_reg:
|
|
if (!net_eq(net, &init_net))
|
|
kfree(table);
|
|
err_alloc:
|
|
return -ENOMEM;
|
|
}
|
|
|
|
static __net_exit void ipv4_sysctl_exit_net(struct net *net)
|
|
{
|
|
struct ctl_table *table;
|
|
|
|
table = net->ipv4.ipv4_hdr->ctl_table_arg;
|
|
unregister_net_sysctl_table(net->ipv4.ipv4_hdr);
|
|
kfree(table);
|
|
}
|
|
|
|
static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
|
|
.init = ipv4_sysctl_init_net,
|
|
.exit = ipv4_sysctl_exit_net,
|
|
};
|
|
|
|
static __init int sysctl_ipv4_init(void)
|
|
{
|
|
struct ctl_table_header *hdr;
|
|
struct ctl_table *i;
|
|
|
|
for (i = ipv4_table; i->procname; i++) {
|
|
if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
|
|
i->data = sysctl_local_reserved_ports;
|
|
break;
|
|
}
|
|
}
|
|
if (!i->procname)
|
|
return -EINVAL;
|
|
|
|
hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table);
|
|
if (hdr == NULL)
|
|
return -ENOMEM;
|
|
|
|
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
|
|
unregister_net_sysctl_table(hdr);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
__initcall(sysctl_ipv4_init);
|