Merge branch 'tcp-plb'
Mubashir Adnan Qureshi says: ==================== net: Add PLB functionality to TCP This patch series adds PLB (Protective Load Balancing) to TCP and hooks it up to DCTCP. PLB is disabled by default and can be enabled using relevant sysctls and support from underlying CC. PLB (Protective Load Balancing) is a host-based mechanism for load balancing across switch links. It leverages congestion signals (e.g. ECN) from the transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
957ed5e712
@ -1069,6 +1069,81 @@ tcp_child_ehash_entries - INTEGER
|
|||||||
|
|
||||||
Default: 0
|
Default: 0
|
||||||
|
|
||||||
|
tcp_plb_enabled - BOOLEAN
|
||||||
|
If set and the underlying congestion control (e.g. DCTCP) supports
|
||||||
|
and enables PLB feature, TCP PLB (Protective Load Balancing) is
|
||||||
|
enabled. PLB is described in the following paper:
|
||||||
|
https://doi.org/10.1145/3544216.3544226. Based on PLB parameters,
|
||||||
|
upon sensing sustained congestion, TCP triggers a change in
|
||||||
|
flow label field for outgoing IPv6 packets. A change in flow label
|
||||||
|
field potentially changes the path of outgoing packets for switches
|
||||||
|
that use ECMP/WCMP for routing.
|
||||||
|
|
||||||
|
PLB changes socket txhash which results in a change in IPv6 Flow Label
|
||||||
|
field, and is currently a no-op for IPv4 headers. It is possible
|
||||||
|
to apply PLB for IPv4 with other network header fields (e.g. TCP
|
||||||
|
or IPv4 options) or using encapsulation where outer header is used
|
||||||
|
by switches to determine next hop. In either case, further host
|
||||||
|
and switch side changes will be needed.
|
||||||
|
|
||||||
|
When set, PLB assumes that congestion signal (e.g. ECN) is made
|
||||||
|
available and used by congestion control module to estimate a
|
||||||
|
congestion measure (e.g. ce_ratio). PLB needs a congestion measure to
|
||||||
|
make repathing decisions.
|
||||||
|
|
||||||
|
Default: FALSE
|
||||||
|
|
||||||
|
tcp_plb_idle_rehash_rounds - INTEGER
|
||||||
|
Number of consecutive congested rounds (RTT) seen after which
|
||||||
|
a rehash can be performed, given there are no packets in flight.
|
||||||
|
This is referred to as M in PLB paper:
|
||||||
|
https://doi.org/10.1145/3544216.3544226.
|
||||||
|
|
||||||
|
Possible Values: 0 - 31
|
||||||
|
|
||||||
|
Default: 3
|
||||||
|
|
||||||
|
tcp_plb_rehash_rounds - INTEGER
|
||||||
|
Number of consecutive congested rounds (RTT) seen after which
|
||||||
|
a forced rehash can be performed. Be careful when setting this
|
||||||
|
parameter, as a small value increases the risk of retransmissions.
|
||||||
|
This is referred to as N in PLB paper:
|
||||||
|
https://doi.org/10.1145/3544216.3544226.
|
||||||
|
|
||||||
|
Possible Values: 0 - 31
|
||||||
|
|
||||||
|
Default: 12
|
||||||
|
|
||||||
|
tcp_plb_suspend_rto_sec - INTEGER
|
||||||
|
Time, in seconds, to suspend PLB in event of an RTO. In order to avoid
|
||||||
|
having PLB repath onto a connectivity "black hole", after an RTO a TCP
|
||||||
|
connection suspends PLB repathing for a random duration between 1x and
|
||||||
|
2x of this parameter. Randomness is added to avoid concurrent rehashing
|
||||||
|
of multiple TCP connections. This should be set corresponding to the
|
||||||
|
amount of time it takes to repair a failed link.
|
||||||
|
|
||||||
|
Possible Values: 0 - 255
|
||||||
|
|
||||||
|
Default: 60
|
||||||
|
|
||||||
|
tcp_plb_cong_thresh - INTEGER
|
||||||
|
Fraction of packets marked with congestion over a round (RTT) to
|
||||||
|
tag that round as congested. This is referred to as K in the PLB paper:
|
||||||
|
https://doi.org/10.1145/3544216.3544226.
|
||||||
|
|
||||||
|
The 0-1 fraction range is mapped to 0-256 range to avoid floating
|
||||||
|
point operations. For example, 128 means that if at least 50% of
|
||||||
|
the packets in a round were marked as congested then the round
|
||||||
|
will be tagged as congested.
|
||||||
|
|
||||||
|
Setting threshold to 0 means that PLB repaths every RTT regardless
|
||||||
|
of congestion. This is not intended behavior for PLB and should be
|
||||||
|
used only for experimentation purposes.
|
||||||
|
|
||||||
|
Possible Values: 0 - 256
|
||||||
|
|
||||||
|
Default: 128
|
||||||
|
|
||||||
UDP variables
|
UDP variables
|
||||||
=============
|
=============
|
||||||
|
|
||||||
|
@ -423,6 +423,7 @@ struct tcp_sock {
|
|||||||
u32 probe_seq_start;
|
u32 probe_seq_start;
|
||||||
u32 probe_seq_end;
|
u32 probe_seq_end;
|
||||||
} mtu_probe;
|
} mtu_probe;
|
||||||
|
u32 plb_rehash; /* PLB-triggered rehash attempts */
|
||||||
u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
|
u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
|
||||||
* while socket was owned by user.
|
* while socket was owned by user.
|
||||||
*/
|
*/
|
||||||
|
@ -183,6 +183,11 @@ struct netns_ipv4 {
|
|||||||
unsigned long tfo_active_disable_stamp;
|
unsigned long tfo_active_disable_stamp;
|
||||||
u32 tcp_challenge_timestamp;
|
u32 tcp_challenge_timestamp;
|
||||||
u32 tcp_challenge_count;
|
u32 tcp_challenge_count;
|
||||||
|
u8 sysctl_tcp_plb_enabled;
|
||||||
|
u8 sysctl_tcp_plb_idle_rehash_rounds;
|
||||||
|
u8 sysctl_tcp_plb_rehash_rounds;
|
||||||
|
u8 sysctl_tcp_plb_suspend_rto_sec;
|
||||||
|
int sysctl_tcp_plb_cong_thresh;
|
||||||
|
|
||||||
int sysctl_udp_wmem_min;
|
int sysctl_udp_wmem_min;
|
||||||
int sysctl_udp_rmem_min;
|
int sysctl_udp_rmem_min;
|
||||||
|
@ -2140,6 +2140,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
|
|||||||
extern void tcp_rack_reo_timeout(struct sock *sk);
|
extern void tcp_rack_reo_timeout(struct sock *sk);
|
||||||
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
|
extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs);
|
||||||
|
|
||||||
|
/* tcp_plb.c */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Scaling factor for fractions in PLB. For example, tcp_plb_update_state
|
||||||
|
* expects cong_ratio which represents fraction of traffic that experienced
|
||||||
|
* congestion over a single RTT. In order to avoid floating point operations,
|
||||||
|
* this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in.
|
||||||
|
*/
|
||||||
|
#define TCP_PLB_SCALE 8
|
||||||
|
|
||||||
|
/* State for PLB (Protective Load Balancing) for a single TCP connection. */
|
||||||
|
struct tcp_plb_state {
|
||||||
|
u8 consec_cong_rounds:5, /* consecutive congested rounds */
|
||||||
|
unused:3;
|
||||||
|
u32 pause_until; /* jiffies32 when PLB can resume rerouting */
|
||||||
|
};
|
||||||
|
|
||||||
|
static inline void tcp_plb_init(const struct sock *sk,
|
||||||
|
struct tcp_plb_state *plb)
|
||||||
|
{
|
||||||
|
plb->consec_cong_rounds = 0;
|
||||||
|
plb->pause_until = 0;
|
||||||
|
}
|
||||||
|
void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
|
||||||
|
const int cong_ratio);
|
||||||
|
void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb);
|
||||||
|
void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb);
|
||||||
|
|
||||||
/* At how many usecs into the future should the RTO fire? */
|
/* At how many usecs into the future should the RTO fire? */
|
||||||
static inline s64 tcp_rto_delta_us(const struct sock *sk)
|
static inline s64 tcp_rto_delta_us(const struct sock *sk)
|
||||||
{
|
{
|
||||||
|
@ -292,6 +292,7 @@ enum
|
|||||||
LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */
|
LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */
|
||||||
LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */
|
LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */
|
||||||
LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */
|
LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */
|
||||||
|
LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */
|
||||||
__LINUX_MIB_MAX
|
__LINUX_MIB_MAX
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -284,6 +284,11 @@ struct tcp_info {
|
|||||||
__u32 tcpi_snd_wnd; /* peer's advertised receive window after
|
__u32 tcpi_snd_wnd; /* peer's advertised receive window after
|
||||||
* scaling (bytes)
|
* scaling (bytes)
|
||||||
*/
|
*/
|
||||||
|
__u32 tcpi_rcv_wnd; /* local advertised receive window after
|
||||||
|
* scaling (bytes)
|
||||||
|
*/
|
||||||
|
|
||||||
|
__u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
|
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
|
||||||
@ -315,6 +320,7 @@ enum {
|
|||||||
TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */
|
TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */
|
||||||
TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */
|
TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */
|
||||||
TCP_NLA_TTL, /* TTL or hop limit of a packet received */
|
TCP_NLA_TTL, /* TTL or hop limit of a packet received */
|
||||||
|
TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */
|
||||||
};
|
};
|
||||||
|
|
||||||
/* for TCP_MD5SIG socket option */
|
/* for TCP_MD5SIG socket option */
|
||||||
|
@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \
|
|||||||
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
|
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
|
||||||
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
|
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
|
||||||
tcp_rate.o tcp_recovery.o tcp_ulp.o \
|
tcp_rate.o tcp_recovery.o tcp_ulp.o \
|
||||||
tcp_offload.o datagram.o raw.o udp.o udplite.o \
|
tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
|
||||||
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
|
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
|
||||||
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
|
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
|
||||||
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
|
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
|
||||||
|
@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = {
|
|||||||
SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
|
SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
|
||||||
SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
|
SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
|
||||||
SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
|
SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
|
||||||
|
SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
|
||||||
SNMP_MIB_SENTINEL
|
SNMP_MIB_SENTINEL
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -40,6 +40,8 @@ static int one_day_secs = 24 * 3600;
|
|||||||
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
|
static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
|
||||||
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
|
FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
|
||||||
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
|
static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
|
||||||
|
static int tcp_plb_max_rounds = 31;
|
||||||
|
static int tcp_plb_max_cong_thresh = 256;
|
||||||
|
|
||||||
/* obsolete */
|
/* obsolete */
|
||||||
static int sysctl_tcp_low_latency __read_mostly;
|
static int sysctl_tcp_low_latency __read_mostly;
|
||||||
@ -1384,6 +1386,47 @@ static struct ctl_table ipv4_net_table[] = {
|
|||||||
.extra1 = SYSCTL_ZERO,
|
.extra1 = SYSCTL_ZERO,
|
||||||
.extra2 = SYSCTL_TWO,
|
.extra2 = SYSCTL_TWO,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_plb_enabled",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_plb_enabled,
|
||||||
|
.maxlen = sizeof(u8),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dou8vec_minmax,
|
||||||
|
.extra1 = SYSCTL_ZERO,
|
||||||
|
.extra2 = SYSCTL_ONE,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_plb_idle_rehash_rounds",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds,
|
||||||
|
.maxlen = sizeof(u8),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dou8vec_minmax,
|
||||||
|
.extra2 = &tcp_plb_max_rounds,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_plb_rehash_rounds",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds,
|
||||||
|
.maxlen = sizeof(u8),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dou8vec_minmax,
|
||||||
|
.extra2 = &tcp_plb_max_rounds,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_plb_suspend_rto_sec",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec,
|
||||||
|
.maxlen = sizeof(u8),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dou8vec_minmax,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_plb_cong_thresh",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh,
|
||||||
|
.maxlen = sizeof(int),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dointvec_minmax,
|
||||||
|
.extra1 = SYSCTL_ZERO,
|
||||||
|
.extra2 = &tcp_plb_max_cong_thresh,
|
||||||
|
},
|
||||||
{ }
|
{ }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags)
|
|||||||
tp->sacked_out = 0;
|
tp->sacked_out = 0;
|
||||||
tp->tlp_high_seq = 0;
|
tp->tlp_high_seq = 0;
|
||||||
tp->last_oow_ack_time = 0;
|
tp->last_oow_ack_time = 0;
|
||||||
|
tp->plb_rehash = 0;
|
||||||
/* There's a bubble in the pipe until at least the first ACK. */
|
/* There's a bubble in the pipe until at least the first ACK. */
|
||||||
tp->app_limited = ~0U;
|
tp->app_limited = ~0U;
|
||||||
tp->rack.mstamp = 0;
|
tp->rack.mstamp = 0;
|
||||||
@ -3939,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
|
|||||||
info->tcpi_reord_seen = tp->reord_seen;
|
info->tcpi_reord_seen = tp->reord_seen;
|
||||||
info->tcpi_rcv_ooopack = tp->rcv_ooopack;
|
info->tcpi_rcv_ooopack = tp->rcv_ooopack;
|
||||||
info->tcpi_snd_wnd = tp->snd_wnd;
|
info->tcpi_snd_wnd = tp->snd_wnd;
|
||||||
|
info->tcpi_rcv_wnd = tp->rcv_wnd;
|
||||||
|
info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
|
||||||
info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
|
info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
|
||||||
unlock_sock_fast(sk, slow);
|
unlock_sock_fast(sk, slow);
|
||||||
}
|
}
|
||||||
@ -3973,6 +3976,7 @@ static size_t tcp_opt_stats_get_size(void)
|
|||||||
nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
|
nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
|
||||||
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
|
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
|
||||||
nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
|
nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
|
||||||
|
nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
|
||||||
0;
|
0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4049,6 +4053,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
|
|||||||
nla_put_u8(stats, TCP_NLA_TTL,
|
nla_put_u8(stats, TCP_NLA_TTL,
|
||||||
tcp_skb_ttl_or_hop_limit(ack_skb));
|
tcp_skb_ttl_or_hop_limit(ack_skb));
|
||||||
|
|
||||||
|
nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
|
||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -54,6 +54,7 @@ struct dctcp {
|
|||||||
u32 next_seq;
|
u32 next_seq;
|
||||||
u32 ce_state;
|
u32 ce_state;
|
||||||
u32 loss_cwnd;
|
u32 loss_cwnd;
|
||||||
|
struct tcp_plb_state plb;
|
||||||
};
|
};
|
||||||
|
|
||||||
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
|
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
|
||||||
@ -91,6 +92,8 @@ static void dctcp_init(struct sock *sk)
|
|||||||
ca->ce_state = 0;
|
ca->ce_state = 0;
|
||||||
|
|
||||||
dctcp_reset(tp, ca);
|
dctcp_reset(tp, ca);
|
||||||
|
tcp_plb_init(sk, &ca->plb);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -117,14 +120,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
|
|||||||
|
|
||||||
/* Expired RTT */
|
/* Expired RTT */
|
||||||
if (!before(tp->snd_una, ca->next_seq)) {
|
if (!before(tp->snd_una, ca->next_seq)) {
|
||||||
|
u32 delivered = tp->delivered - ca->old_delivered;
|
||||||
u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
|
u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
|
||||||
u32 alpha = ca->dctcp_alpha;
|
u32 alpha = ca->dctcp_alpha;
|
||||||
|
u32 ce_ratio = 0;
|
||||||
|
|
||||||
|
if (delivered > 0) {
|
||||||
|
/* dctcp_alpha keeps EWMA of fraction of ECN marked
|
||||||
|
* packets. Because of EWMA smoothing, PLB reaction can
|
||||||
|
* be slow so we use ce_ratio which is an instantaneous
|
||||||
|
* measure of congestion. ce_ratio is the fraction of
|
||||||
|
* ECN marked packets in the previous RTT.
|
||||||
|
*/
|
||||||
|
if (delivered_ce > 0)
|
||||||
|
ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
|
||||||
|
tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
|
||||||
|
tcp_plb_check_rehash(sk, &ca->plb);
|
||||||
|
}
|
||||||
|
|
||||||
/* alpha = (1 - g) * alpha + g * F */
|
/* alpha = (1 - g) * alpha + g * F */
|
||||||
|
|
||||||
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
|
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
|
||||||
if (delivered_ce) {
|
if (delivered_ce) {
|
||||||
u32 delivered = tp->delivered - ca->old_delivered;
|
|
||||||
|
|
||||||
/* If dctcp_shift_g == 1, a 32bit value would overflow
|
/* If dctcp_shift_g == 1, a 32bit value would overflow
|
||||||
* after 8 M packets.
|
* after 8 M packets.
|
||||||
@ -172,8 +189,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
|
|||||||
dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
|
dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
|
||||||
break;
|
break;
|
||||||
case CA_EVENT_LOSS:
|
case CA_EVENT_LOSS:
|
||||||
|
tcp_plb_update_state_upon_rto(sk, &ca->plb);
|
||||||
dctcp_react_to_loss(sk);
|
dctcp_react_to_loss(sk);
|
||||||
break;
|
break;
|
||||||
|
case CA_EVENT_TX_START:
|
||||||
|
tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
/* Don't care for the rest. */
|
/* Don't care for the rest. */
|
||||||
break;
|
break;
|
||||||
|
@ -3218,6 +3218,14 @@ static int __net_init tcp_sk_init(struct net *net)
|
|||||||
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
|
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
|
||||||
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
|
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
|
||||||
|
|
||||||
|
/* Set default values for PLB */
|
||||||
|
net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
|
||||||
|
net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
|
||||||
|
net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
|
||||||
|
net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
|
||||||
|
/* Default congestion threshold for PLB to mark a round is 50% */
|
||||||
|
net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
|
||||||
|
|
||||||
/* Reno is always built in */
|
/* Reno is always built in */
|
||||||
if (!net_eq(net, &init_net) &&
|
if (!net_eq(net, &init_net) &&
|
||||||
bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
|
bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
|
||||||
|
109
net/ipv4/tcp_plb.c
Normal file
109
net/ipv4/tcp_plb.c
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
/* Protective Load Balancing (PLB)
|
||||||
|
*
|
||||||
|
* PLB was designed to reduce link load imbalance across datacenter
|
||||||
|
* switches. PLB is a host-based optimization; it leverages congestion
|
||||||
|
* signals from the transport layer to randomly change the path of the
|
||||||
|
* connection experiencing sustained congestion. PLB prefers to repath
|
||||||
|
* after idle periods to minimize packet reordering. It repaths by
|
||||||
|
* changing the IPv6 Flow Label on the packets of a connection, which
|
||||||
|
* datacenter switches include as part of ECMP/WCMP hashing.
|
||||||
|
*
|
||||||
|
* PLB is described in detail in:
|
||||||
|
*
|
||||||
|
* Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
|
||||||
|
* Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
|
||||||
|
* David Wetherall, Abdul Kabbani:
|
||||||
|
* "PLB: Congestion Signals are Simple and Effective for
|
||||||
|
* Network Load Balancing"
|
||||||
|
* In ACM SIGCOMM 2022, Amsterdam, Netherlands.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <net/tcp.h>
|
||||||
|
|
||||||
|
/* Called once per round-trip to update PLB state for a connection. */
|
||||||
|
void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
|
||||||
|
const int cong_ratio)
|
||||||
|
{
|
||||||
|
struct net *net = sock_net(sk);
|
||||||
|
|
||||||
|
if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (cong_ratio >= 0) {
|
||||||
|
if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
|
||||||
|
plb->consec_cong_rounds = 0;
|
||||||
|
else if (plb->consec_cong_rounds <
|
||||||
|
READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
|
||||||
|
plb->consec_cong_rounds++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(tcp_plb_update_state);
|
||||||
|
|
||||||
|
/* Check whether recent congestion has been persistent enough to warrant
|
||||||
|
* a load balancing decision that switches the connection to another path.
|
||||||
|
*/
|
||||||
|
void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
|
||||||
|
{
|
||||||
|
struct net *net = sock_net(sk);
|
||||||
|
u32 max_suspend;
|
||||||
|
bool forced_rehash = false, idle_rehash = false;
|
||||||
|
|
||||||
|
if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
|
||||||
|
return;
|
||||||
|
|
||||||
|
forced_rehash = plb->consec_cong_rounds >=
|
||||||
|
READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
|
||||||
|
/* If sender goes idle then we check whether to rehash. */
|
||||||
|
idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
|
||||||
|
!tcp_sk(sk)->packets_out &&
|
||||||
|
plb->consec_cong_rounds >=
|
||||||
|
READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);
|
||||||
|
|
||||||
|
if (!forced_rehash && !idle_rehash)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
|
||||||
|
* cases where the max suspension end is before the actual suspension
|
||||||
|
* end. We clear pause_until to 0 to indicate there is no recent
|
||||||
|
* RTO event that constrains PLB rehashing.
|
||||||
|
*/
|
||||||
|
max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
|
||||||
|
if (plb->pause_until &&
|
||||||
|
(!before(tcp_jiffies32, plb->pause_until) ||
|
||||||
|
before(tcp_jiffies32 + max_suspend, plb->pause_until)))
|
||||||
|
plb->pause_until = 0;
|
||||||
|
|
||||||
|
if (plb->pause_until)
|
||||||
|
return;
|
||||||
|
|
||||||
|
sk_rethink_txhash(sk);
|
||||||
|
plb->consec_cong_rounds = 0;
|
||||||
|
tcp_sk(sk)->plb_rehash++;
|
||||||
|
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
|
||||||
|
|
||||||
|
/* Upon RTO, disallow load balancing for a while, to avoid having load
|
||||||
|
* balancing decisions switch traffic to a black-holed path that was
|
||||||
|
* previously avoided with a sk_rethink_txhash() call at RTO time.
|
||||||
|
*/
|
||||||
|
void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
|
||||||
|
{
|
||||||
|
struct net *net = sock_net(sk);
|
||||||
|
u32 pause;
|
||||||
|
|
||||||
|
if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
|
||||||
|
return;
|
||||||
|
|
||||||
|
pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
|
||||||
|
pause += prandom_u32_max(pause);
|
||||||
|
plb->pause_until = tcp_jiffies32 + pause;
|
||||||
|
|
||||||
|
/* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
|
||||||
|
* that may switch this connection to a path with completely different
|
||||||
|
* congestion characteristics.
|
||||||
|
*/
|
||||||
|
plb->consec_cong_rounds = 0;
|
||||||
|
}
|
||||||
|
EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
|
Loading…
x
Reference in New Issue
Block a user