6025b9135f
softnet_data->time_squeeze is sometimes used as a proxy for host overload or indication of scheduling problems. In practice this statistic is very noisy and has hard to grasp units - e.g. is 10 squeezes a second to be expected, or high? Delaying network (NAPI) processing leads to drops on NIC queues but also RTT bloat, impacting pacing and CA decisions. Stalls are a little hard to detect on the Rx side, because there may simply have not been any packets received in given period of time. Packet timestamps help a little bit, but again we don't know if packets are stale because we're not keeping up or because someone (*cough* cgroups) disabled IRQs for a long time. We can, however, use Tx as a proxy for Rx stalls. Most drivers use combined Rx+Tx NAPIs so if Tx gets starved so will Rx. On the Tx side we know exactly when packets get queued, and completed, so there is no uncertainty. This patch adds stall checks to BQL. Why BQL? Because it's a convenient place to add such checks, already called by most drivers, and it has copious free space in its structures (this patch adds no extra cache references or dirtying to the fast path). The algorithm takes one parameter - max delay AKA stall threshold and increments a counter whenever NAPI got delayed for at least that amount of time. It also records the length of the longest stall. To be precise every time NAPI has not polled for at least stall thrs we check if there were any Tx packets queued between last NAPI run and now - stall_thrs/2. Unlike the classic Tx watchdog this mechanism does not ignore stalls caused by Tx being disabled, or loss of link. I don't think the check is worth the complexity, and stall is a stall, whether due to host overload, flow control, link down... doesn't matter much to the application. We have been running this detector in production at Meta for 2 years, with the threshold of 8ms. It's the lowest value where false positives become rare. 
There's still a constant stream of reported stalls (especially without the ksoftirqd deferral patches reverted), those who like their stall metrics to be 0 may prefer higher value. Signed-off-by: Jakub Kicinski <kuba@kernel.org> Signed-off-by: Breno Leitao <leitao@debian.org> Signed-off-by: David S. Miller <davem@davemloft.net>
154 lines
5.2 KiB
C
154 lines
5.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Dynamic queue limits (dql) - Definitions
 *
 * Copyright (c) 2011, Tom Herbert <therbert@google.com>
 *
 * This header file contains the definitions for dynamic queue limits (dql).
 * dql would be used in conjunction with a producer/consumer type queue
 * (possibly a HW queue).  Such a queue would have these general properties:
 *
 *   1) Objects are queued up to some limit specified as number of objects.
 *   2) Periodically a completion process executes which retires consumed
 *      objects.
 *   3) Starvation occurs when limit has been reached, all queued data has
 *      actually been consumed, but completion processing has not yet run
 *      so queuing new data is blocked.
 *   4) Minimizing the amount of queued data is desirable.
 *
 * The goal of dql is to calculate the limit as the minimum number of objects
 * needed to prevent starvation.
 *
 * The primary functions of dql are:
 *    dql_queued - called when objects are enqueued to record number of objects
 *    dql_avail - returns how many objects are available to be queued based
 *      on the object limit and how many objects are already enqueued
 *    dql_completed - called at completion time to indicate how many objects
 *      were retired from the queue
 *
 * The dql implementation does not implement any locking for the dql data
 * structures, the higher layer should provide this.  dql_queued should
 * be serialized to prevent concurrent execution of the function; this
 * is also true for dql_completed.  However, dql_queued and dql_completed can
 * be executed concurrently (i.e. they can be protected by different locks).
 */
|
|
|
|
#ifndef _LINUX_DQL_H
|
|
#define _LINUX_DQL_H
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
#include <linux/bitops.h>
|
|
#include <asm/bug.h>
|
|
|
|
/* Number of unsigned long words in the dql->history bitmap ring. */
#define DQL_HIST_LEN		4

/*
 * Ring slot of dql->history selected by idx.  The index is reduced modulo
 * DQL_HIST_LEN, so callers can pass an ever-growing counter (dql_queued()
 * passes jiffies / BITS_PER_LONG).  Expands to an lvalue, so it can be
 * both read and assigned.
 */
#define DQL_HIST_ENT(dql, idx)	((dql)->history[(idx) % DQL_HIST_LEN])
|
|
|
|
struct dql {
|
|
/* Fields accessed in enqueue path (dql_queued) */
|
|
unsigned int num_queued; /* Total ever queued */
|
|
unsigned int adj_limit; /* limit + num_completed */
|
|
unsigned int last_obj_cnt; /* Count at last queuing */
|
|
|
|
unsigned long history_head; /* top 58 bits of jiffies */
|
|
/* stall entries, a bit per entry */
|
|
unsigned long history[DQL_HIST_LEN];
|
|
|
|
/* Fields accessed only by completion path (dql_completed) */
|
|
|
|
unsigned int limit ____cacheline_aligned_in_smp; /* Current limit */
|
|
unsigned int num_completed; /* Total ever completed */
|
|
|
|
unsigned int prev_ovlimit; /* Previous over limit */
|
|
unsigned int prev_num_queued; /* Previous queue total */
|
|
unsigned int prev_last_obj_cnt; /* Previous queuing cnt */
|
|
|
|
unsigned int lowest_slack; /* Lowest slack found */
|
|
unsigned long slack_start_time; /* Time slacks seen */
|
|
|
|
/* Configuration */
|
|
unsigned int max_limit; /* Max limit */
|
|
unsigned int min_limit; /* Minimum limit */
|
|
unsigned int slack_hold_time; /* Time to measure slack */
|
|
|
|
/* Stall threshold (in jiffies), defined by user */
|
|
unsigned short stall_thrs;
|
|
/* Longest stall detected, reported to user */
|
|
unsigned short stall_max;
|
|
unsigned long last_reap; /* Last reap (in jiffies) */
|
|
unsigned long stall_cnt; /* Number of stalls */
|
|
};
|
|
|
|
/*
 * Static maximums.
 *
 * DQL_MAX_OBJECT is the largest count dql_queued() accepts in one call
 * (anything larger trips its BUG_ON).  DQL_MAX_LIMIT is derived from it so
 * that DQL_MAX_LIMIT + DQL_MAX_OBJECT equals UINT_MAX / 2.
 */
#define DQL_MAX_OBJECT	(UINT_MAX / 16)
#define DQL_MAX_LIMIT	((UINT_MAX / 2) - DQL_MAX_OBJECT)
|
|
|
|
/*
|
|
* Record number of objects queued. Assumes that caller has already checked
|
|
* availability in the queue with dql_avail.
|
|
*/
|
|
static inline void dql_queued(struct dql *dql, unsigned int count)
|
|
{
|
|
unsigned long map, now, now_hi, i;
|
|
|
|
BUG_ON(count > DQL_MAX_OBJECT);
|
|
|
|
dql->last_obj_cnt = count;
|
|
|
|
/* We want to force a write first, so that cpu do not attempt
|
|
* to get cache line containing last_obj_cnt, num_queued, adj_limit
|
|
* in Shared state, but directly does a Request For Ownership
|
|
* It is only a hint, we use barrier() only.
|
|
*/
|
|
barrier();
|
|
|
|
dql->num_queued += count;
|
|
|
|
now = jiffies;
|
|
now_hi = now / BITS_PER_LONG;
|
|
|
|
/* The following code set a bit in the ring buffer, where each
|
|
* bit trackes time the packet was queued. The dql->history buffer
|
|
* tracks DQL_HIST_LEN * BITS_PER_LONG time (jiffies) slot
|
|
*/
|
|
if (unlikely(now_hi != dql->history_head)) {
|
|
/* About to reuse slots, clear them */
|
|
for (i = 0; i < DQL_HIST_LEN; i++) {
|
|
/* Multiplication masks high bits */
|
|
if (now_hi * BITS_PER_LONG ==
|
|
(dql->history_head + i) * BITS_PER_LONG)
|
|
break;
|
|
DQL_HIST_ENT(dql, dql->history_head + i + 1) = 0;
|
|
}
|
|
/* pairs with smp_rmb() in dql_check_stall() */
|
|
smp_wmb();
|
|
WRITE_ONCE(dql->history_head, now_hi);
|
|
}
|
|
|
|
/* __set_bit() does not guarantee WRITE_ONCE() semantics */
|
|
map = DQL_HIST_ENT(dql, now_hi);
|
|
|
|
/* Populate the history with an entry (bit) per queued */
|
|
if (!(map & BIT_MASK(now)))
|
|
WRITE_ONCE(DQL_HIST_ENT(dql, now_hi), map | BIT_MASK(now));
|
|
}
|
|
|
|
/* Returns how many objects can be queued, < 0 indicates over limit. */
|
|
static inline int dql_avail(const struct dql *dql)
|
|
{
|
|
return READ_ONCE(dql->adj_limit) - READ_ONCE(dql->num_queued);
|
|
}
|
|
|
|
/* Record number of completed objects and recalculate the limit. */
void dql_completed(struct dql *dql, unsigned int count);

/* Reset dql state */
void dql_reset(struct dql *dql);

/*
 * Initialize dql state.
 * NOTE(review): hold_time presumably seeds dql->slack_hold_time; the
 * definition is not in this header - confirm against the implementation.
 */
void dql_init(struct dql *dql, unsigned int hold_time);
|
|
|
|
#endif /* __KERNEL__ */
|
|
|
|
#endif /* _LINUX_DQL_H */
|