Merge master.kernel.org:/pub/scm/linux/kernel/git/acme/net-2.6

This commit is contained in:
David S. Miller 2006-12-03 19:24:40 -08:00
commit 83ac58ba0a
4 changed files with 237 additions and 139 deletions

View File

@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG
parameter to 0 or 1.
If in doubt, say N.
config IP_DCCP_CCID3_RTO
int "Use higher bound for nofeedback timer"
default 100
depends on IP_DCCP_CCID3 && EXPERIMENTAL
---help---
Use higher lower bound for nofeedback timer expiration.
The TFRC nofeedback timer normally expires after the maximum of 4
RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
with a small RTT this can mean a high processing load and reduced
performance, since then the nofeedback timer is triggered very
frequently.
This option enables to set a higher lower bound for the nofeedback
value. Values in units of milliseconds can be set here.
A value of 0 disables this feature by enforcing the value specified
in RFC 3448. The following values have been suggested as bounds for
experimental use:
* 16-20ms to match the typical multimedia inter-frame interval
* 100ms as a reasonable compromise [default]
* 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
The default of 100ms is a compromise between a large value for
efficient DCCP implementations, and a small value to avoid disrupting
the network in times of congestion.
The purpose of the nofeedback timer is to slow DCCP down when there
is serious network congestion: experimenting with larger values should
therefore not be performed on WANs.
endmenu

View File

@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx)
/*
* Update X by
* If (p > 0)
* x_calc = calcX(s, R, p);
* X_calc = calcX(s, R, p);
* X = max(min(X_calc, 2 * X_recv), s / t_mbi);
* Else
* If (now - tld >= R)
* X = max(min(2 * X, 2 * X_recv), s / R);
* tld = now;
*
* If X has changed, we also update the scheduled send time t_now,
* the inter-packet interval t_ipi, and the delta value.
*/
static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
@ -134,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
const __u32 old_x = hctx->ccid3hctx_x;
/* To avoid large error in calcX */
if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
if (hctx->ccid3hctx_p > 0) {
hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
hctx->ccid3hctx_rtt,
hctx->ccid3hctx_p);
@ -223,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
ccid3_tx_state_name(hctx->ccid3hctx_state));
/* Halve sending rate */
/* If (X_calc > 2 * X_recv)
/* If (p == 0 || X_calc > 2 * X_recv)
* X_recv = max(X_recv / 2, s / (2 * t_mbi));
* Else
* X_recv = X_calc / 4;
*/
BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
hctx->ccid3hctx_x_calc == 0);
BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
/* check also if p is zero -> x_calc is infinity? */
if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
if (hctx->ccid3hctx_p == 0 ||
hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
hctx->ccid3hctx_s / (2 * TFRC_T_MBI));
@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
}
/*
* Schedule no feedback timer to expire in
* max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi)
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
* See comments in packet_recv() regarding the value of t_RTO.
*/
t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
break;
case TFRC_SSTATE_NO_SENT:
DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk);
@ -338,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
* else
* // send the packet in (t_nom - t_now) milliseconds.
*/
if (delay >= hctx->ccid3hctx_delta)
if (delay - (long)hctx->ccid3hctx_delta >= 0)
return delay / 1000L;
break;
case TFRC_SSTATE_TERM:
@ -412,10 +413,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
struct dccp_tx_hist_entry *packet;
struct timeval now;
unsigned long t_nfb;
u32 t_elapsed;
u32 pinv;
u32 x_recv;
u32 r_sample;
long r_sample, t_elapsed;
BUG_ON(hctx == NULL);
@ -426,31 +425,44 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
opt_recv = &hctx->ccid3hctx_options_received;
t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
x_recv = opt_recv->ccid3or_receive_rate;
pinv = opt_recv->ccid3or_loss_event_rate;
switch (hctx->ccid3hctx_state) {
case TFRC_SSTATE_NO_FBACK:
case TFRC_SSTATE_FBACK:
/* Calculate new round trip sample by
* R_sample = (now - t_recvdata) - t_delay */
/* get t_recvdata from history */
/* get packet from history to look up t_recvdata */
packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
DCCP_SKB_CB(skb)->dccpd_ack_seq);
if (unlikely(packet == NULL)) {
DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist "
DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist "
"in history!\n", dccp_role(sk), sk,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
return;
}
/* Update RTT */
/* Update receive rate */
hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
/* Update loss event rate */
pinv = opt_recv->ccid3or_loss_event_rate;
if (pinv == ~0U || pinv == 0)
hctx->ccid3hctx_p = 0;
else
hctx->ccid3hctx_p = 1000000 / pinv;
dccp_timestamp(sk, &now);
/*
* Calculate new round trip sample as per [RFC 3448, 4.3] by
* R_sample = (now - t_recvdata) - t_elapsed
*/
r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
if (unlikely(r_sample <= t_elapsed))
DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n",
t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
if (unlikely(r_sample <= 0)) {
DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample);
r_sample = 0;
} else if (unlikely(r_sample <= t_elapsed))
DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n",
r_sample, t_elapsed);
else
r_sample -= t_elapsed;
@ -473,31 +485,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hctx->ccid3hctx_t_ld = now;
ccid3_update_send_time(hctx);
ccid3_pr_debug("%s(%p), s=%u, w_init=%u, "
"R_sample=%ldus, X=%u\n", dccp_role(sk),
sk, hctx->ccid3hctx_s, w_init, r_sample,
hctx->ccid3hctx_x);
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
} else {
hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
r_sample / 10;
hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt +
(u32)r_sample ) / 10;
ccid3_hc_tx_update_x(sk, &now);
}
ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
"r_sample=%us\n", dccp_role(sk), sk,
hctx->ccid3hctx_rtt, r_sample);
/* Update receive rate */
hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
/* Update loss event rate */
if (pinv == ~0 || pinv == 0)
hctx->ccid3hctx_p = 0;
else {
hctx->ccid3hctx_p = 1000000 / pinv;
if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
hctx->ccid3hctx_p = TFRC_SMALLEST_P;
ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
dccp_role(sk), sk);
}
ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, "
"p=%u, X_calc=%u, X=%u\n", dccp_role(sk),
sk, hctx->ccid3hctx_rtt, r_sample,
hctx->ccid3hctx_s, hctx->ccid3hctx_p,
hctx->ccid3hctx_x_calc,
hctx->ccid3hctx_x);
}
/* unschedule no feedback timer */
@ -512,16 +518,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
*/
sk->sk_write_space(sk);
/* Update timeout interval. We use the alternative variant of
* [RFC 3448, 3.1] which sets the upper bound of t_rto to one
* second, as it is suggested for TCP (see RFC 2988, 2.4). */
/*
* Update timeout interval for the nofeedback timer.
* We use a configuration option to increase the lower bound.
* This can help avoid triggering the nofeedback timer too often
* ('spinning') on LANs with small RTTs.
*/
hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
USEC_PER_SEC );
CONFIG_IP_DCCP_CCID3_RTO *
(USEC_PER_SEC/1000) );
/*
* Schedule no feedback timer to expire in
* max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi)
* max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
*/
t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
"expire in %lu jiffies (%luus)\n",
@ -535,7 +545,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
hctx->ccid3hctx_idle = 1;
break;
case TFRC_SSTATE_NO_SENT:
DCCP_WARN("Illegal ACK received - no packet has been sent\n");
if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT)
DCCP_WARN("Illegal ACK received - no packet sent\n");
/* fall through */
case TFRC_SSTATE_TERM: /* ignore feedback when closing */
break;

View File

@ -51,8 +51,6 @@
/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
#define TFRC_T_MBI 64
#define TFRC_SMALLEST_P 40
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
TFRC_OPT_LOSS_INTERVALS = 193,

View File

@ -18,10 +18,79 @@
#include "tfrc.h"
#define TFRC_CALC_X_ARRSIZE 500
#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
#define TFRC_CALC_X_SPLIT 50000
/* equivalent to 0.05 */
/*
TFRC TCP Reno Throughput Equation Lookup Table for f(p)
The following two-column lookup table implements a part of the TCP throughput
equation from [RFC 3448, sec. 3.1]:
s
X_calc = --------------------------------------------------------------
R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
Where:
X is the transmit rate in bytes/second
s is the packet size in bytes
R is the round trip time in seconds
p is the loss event rate, between 0 and 1.0, of the number of loss
events as a fraction of the number of packets transmitted
t_RTO is the TCP retransmission timeout value in seconds
b is the number of packets acknowledged by a single TCP ACK
We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
s
X_calc = -------------------------------------------------------
R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
which we can break down into:
s
X_calc = ---------
R * f(p)
where f(p) is given for 0 < p <= 1 by:
f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
Since this is kernel code, floating-point arithmetic is avoided in favour of
integer arithmetic. This means that nearly all fractional parameters are
scaled by 1000000:
* the parameters p and R
* the return result f(p)
The lookup table therefore actually tabulates the following function g(q):
g(q) = 1000000 * f(q/1000000)
Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
granularity for the practically more relevant case of small values of p (up to
5%), the second column is used; the first one ranges up to 100%. This split
corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
determines the smallest resolution possible with this lookup table:
TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
The entire table is generated by:
for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
}
With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
lookup[M][0] = g(1000000) = 1000000 * f(100%)
lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
In summary, the two columns represent f(p) for the following ranges:
* The first column is for 0.002 <= p <= 1.0
* The second column is for 0.0001 <= p <= 0.05
Where the columns overlap, the second (finer-grained) is given preference,
i.e. the first column is used only for p >= 0.05.
*/
static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 37172, 8172 },
{ 53499, 11567 },
@ -525,85 +594,69 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 243315981, 271305 }
};
/* Calculate the send rate as per section 3.1 of RFC3448
/* return largest index i such that fval <= lookup[i][small] */
static inline u32 tfrc_binsearch(u32 fval, u8 small)
{
u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
Returns send rate in bytes per second
while (low < high) {
try = (low + high) / 2;
if (fval <= tfrc_calc_x_lookup[try][small])
high = try;
else
low = try + 1;
}
return high;
}
Integer maths and lookups are used as not allowed floating point in kernel
The function for Xcalc as per section 3.1 of RFC3448 is:
X = s
-------------------------------------------------------------
R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
where
X is the trasmit rate in bytes/second
s is the packet size in bytes
R is the round trip time in seconds
p is the loss event rate, between 0 and 1.0, of the number of loss events
as a fraction of the number of packets transmitted
t_RTO is the TCP retransmission timeout value in seconds
b is the number of packets acknowledged by a single TCP acknowledgement
we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
X = s
-----------------------------------------------------------------------
R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
which we can break down into:
X = s
--------
R * f(p)
where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
Function parameters:
s - bytes
R - RTT in usecs
p - loss rate (decimal fraction multiplied by 1,000,000)
Returns Xcalc in bytes per second
DON'T alter this code unless you run test cases against it as the code
has been manipulated to stop underflow/overlow.
*/
/**
* tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
*
* @s: packet size in bytes
* @R: RTT scaled by 1000000 (i.e., microseconds)
* @p: loss ratio estimate scaled by 1000000
* Returns X_calc in bytes per second (not scaled).
*
* Note: DO NOT alter this code unless you run test cases against it,
* as the code has been optimized to stop underflow/overflow.
*/
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{
int index;
u32 f;
u64 tmp1, tmp2;
if (p < TFRC_CALC_X_SPLIT)
index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
else
index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
if (index < 0)
/* p should be 0 unless there is a bug in my code */
index = 0;
if (R == 0) {
DCCP_WARN("RTT==0, setting to 1\n");
R = 1; /* RTT can't be zero or else divide by zero */
/* check against invalid parameters and divide-by-zero */
BUG_ON(p > 1000000); /* p must not exceed 100% */
BUG_ON(p == 0); /* f(0) = 0, divide by zero */
if (R == 0) { /* possible divide by zero */
DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
return ~0U;
}
BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
DCCP_WARN("Value of p (%d) below resolution. "
"Substituting %d\n", p, TFRC_SMALLEST_P);
index = 0;
} else /* 0.0001 <= p <= 0.05 */
index = p/TFRC_SMALLEST_P - 1;
if (p >= TFRC_CALC_X_SPLIT)
f = tfrc_calc_x_lookup[index][0];
else
f = tfrc_calc_x_lookup[index][1];
} else { /* 0.05 < p <= 1.00 */
index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
f = tfrc_calc_x_lookup[index][0];
}
/* The following computes X = s/(R*f(p)) in bytes per second. Since f(p)
* and R are both scaled by 1000000, we need to multiply by 1000000^2.
* ==> DO NOT alter this unless you test against overflow on 32 bit */
tmp1 = ((u64)s * 100000000);
tmp2 = ((u64)R * (u64)f);
do_div(tmp2, 10000);
do_div(tmp1, tmp2);
/* Don't alter above math unless you test due to overflow on 32 bit */
return (u32)tmp1;
}
@ -611,33 +664,36 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
EXPORT_SYMBOL_GPL(tfrc_calc_x);
/*
* args: fvalue - function value to match
* returns: p closest to that value
* tfrc_calc_x_reverse_lookup - try to find p given f(p)
*
* both fvalue and p are multiplied by 1,000,000 to use ints
* @fvalue: function value to match, scaled by 1000000
* Returns closest match for p, also scaled by 1000000
*/
u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
{
int ctr = 0;
int small;
int index;
if (fvalue < tfrc_calc_x_lookup[0][1])
if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
return 0;
if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
small = 1;
else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
/* Error cases. */
if (fvalue < tfrc_calc_x_lookup[0][1]) {
DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
return tfrc_calc_x_lookup[0][1];
}
if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
return 1000000;
else
small = 0;
}
while (fvalue > tfrc_calc_x_lookup[ctr][small])
ctr++;
if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
index = tfrc_binsearch(fvalue, 1);
return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
}
if (small)
return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
else
return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
/* else ... it must be in the coarse-grained column */
index = tfrc_binsearch(fvalue, 0);
return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);