Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'
Yury Norov says:

====================
sched: cpumask: improve on cpumask_local_spread() locality

cpumask_local_spread() currently checks local node for presence of i'th
CPU, and then if it finds nothing makes a flat search among all non-local
CPUs. We can do it better by checking CPUs per NUMA hops.

This has significant performance implications on NUMA machines, for
example when using NUMA-aware allocated memory together with NUMA-aware
IRQ affinity hints.

Performance tests from patch 8 of this series for mellanox network
driver show:

TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121

+-------------------------+-----------+------------------+------------------+
|                         | BW (Gbps) | TX side CPU util | RX side CPU util |
+-------------------------+-----------+------------------+------------------+
| Baseline                | 52.3      | 6.4 %            | 17.9 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on TX side only | 52.6      | 5.2 %            | 18.5 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on RX side only | 94.9      | 11.9 %           | 27.2 %           |
+-------------------------+-----------+------------------+------------------+
| Applied on both sides   | 95.1      | 8.4 %            | 27.3 %           |
+-------------------------+-----------+------------------+------------------+

Bottleneck in RX side is released, reached linerate (~1.8x speedup).
~30% less cpu util on TX.
====================

Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit cc74ca303a
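The heart of the series is the for_each_numa_hop_mask() iterator added to include/linux/topology.h below: instead of calling cpumask_local_spread() in a loop, a driver walks CPUs hop by hop, closest NUMA distance first. A minimal sketch of that pattern follows, mirroring the comp_irqs_request() change in this merge; the helper name fill_cpus_by_distance() and its signature are hypothetical, only the iterators, cpu_none_mask and the RCU requirement come from the patches.

#include <linux/cpumask.h>
#include <linux/rcupdate.h>
#include <linux/topology.h>

/*
 * Hypothetical helper: fill cpus[0..ncpus-1] with CPU numbers ordered by
 * NUMA distance from @node, closest hop first. Returns how many entries
 * were filled.
 */
static int fill_cpus_by_distance(int node, u16 *cpus, int ncpus)
{
	const struct cpumask *prev = cpu_none_mask;
	const struct cpumask *mask;
	int i = 0, cpu;

	rcu_read_lock();	/* the hop masks are RCU-protected */
	for_each_numa_hop_mask(mask, node) {
		/* visit only CPUs not already seen at a closer hop */
		for_each_cpu_andnot(cpu, mask, prev) {
			cpus[i] = cpu;
			if (++i == ncpus)
				goto done;
		}
		prev = mask;
	}
done:
	rcu_read_unlock();
	return i;
}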
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
 static int comp_irqs_request(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
+	const struct cpumask *prev = cpu_none_mask;
+	const struct cpumask *mask;
 	int ncomp_eqs = table->num_comp_eqs;
 	u16 *cpus;
 	int ret;
+	int cpu;
 	int i;
 
 	ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
 		ret = -ENOMEM;
 		goto free_irqs;
 	}
-	for (i = 0; i < ncomp_eqs; i++)
-		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+
+	i = 0;
+	rcu_read_lock();
+	for_each_numa_hop_mask(mask, dev->priv.numa_node) {
+		for_each_cpu_andnot(cpu, mask, prev) {
+			cpus[i] = cpu;
+			if (++i == ncomp_eqs)
+				goto spread_done;
+		}
+		prev = mask;
+	}
+spread_done:
+	rcu_read_unlock();
 	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
 	kfree(cpus);
 	if (ret < 0)
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
 				nr_cpumask_bits, cpumask_check(cpu));
 }
 
+/**
+ * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
+ * @srcp1: the cpumask pointer
+ * @srcp2: the cpumask pointer
+ * @srcp3: the cpumask pointer
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ */
+static __always_inline
+unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
+					const struct cpumask *srcp2,
+					const struct cpumask *srcp3)
+{
+	return find_nth_and_andnot_bit(cpumask_bits(srcp1),
+					cpumask_bits(srcp2),
+					cpumask_bits(srcp3),
+					nr_cpumask_bits, cpumask_check(cpu));
+}
+
 #define CPU_BITS_NONE						\
 {								\
 	[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL			\
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
 					unsigned long size, unsigned long n);
 unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
 					unsigned long size, unsigned long n);
+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+					const unsigned long *addr3, unsigned long size,
+					unsigned long n);
 extern unsigned long _find_first_and_bit(const unsigned long *addr1,
 					 const unsigned long *addr2, unsigned long size);
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
 	return __find_nth_andnot_bit(addr1, addr2, size, n);
 }
 
+/**
+ * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
+ *			     excluding those set in 3rd region
+ * @addr1: The 1st address to start the search at
+ * @addr2: The 2nd address to start the search at
+ * @addr3: The 3rd address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The number of set bit, which position is needed, counting from 0
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such, returns @size.
+ */
+static __always_inline
+unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
+					const unsigned long *addr2,
+					const unsigned long *addr3,
+					unsigned long size, unsigned long n)
+{
+	if (n >= size)
+		return size;
+
+	if (small_const_nbits(size)) {
+		unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
+
+		return val ? fns(val, n) : size;
+	}
+
+	return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
+}
+
 #ifndef find_first_and_bit
 /**
  * find_first_and_bit - find the first set bit in both memory regions
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }
 
+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
+#else
+static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	return cpumask_nth(cpu, cpus);
+}
+
+static inline const struct cpumask *
+sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+#endif	/* CONFIG_NUMA */
+
+/**
+ * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
+ *                          from a given node.
+ * @mask: the iteration variable.
+ * @node: the NUMA node to start the search from.
+ *
+ * Requires rcu_lock to be held.
+ *
+ * Yields cpu_online_mask for @node == NUMA_NO_NODE.
+ */
+#define for_each_numa_hop_mask(mask, node)			\
+	for (unsigned int __hops = 0;				\
+	     mask = (node != NUMA_NO_NODE || __hops) ?		\
+		     sched_numa_hop_mask(node, __hops) :	\
+		     cpu_online_mask,				\
+	     !IS_ERR_OR_NULL(mask);				\
+	     __hops++)
+
 #endif /* _LINUX_TOPOLOGY_H */
@@ -3,6 +3,8 @@
  * Scheduler topology setup/handling methods
  */
 
+#include <linux/bsearch.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
 
 /* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ unlock:
 	return found;
 }
 
+struct __cmp_key {
+	const struct cpumask *cpus;
+	struct cpumask ***masks;
+	int node;
+	int cpu;
+	int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+	struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
+	struct cpumask **cur_hop = *(struct cpumask ***)b;
+	struct __cmp_key *k = (struct __cmp_key *)a;
+
+	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+		return 1;
+
+	k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
+	if (k->w <= k->cpu)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
+ *                             closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: Nth cpu to find
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+	struct cpumask ***hop_masks;
+	int hop, ret = nr_cpu_ids;
+
+	rcu_read_lock();
+
+	k.masks = rcu_dereference(sched_domains_numa_masks);
+	if (!k.masks)
+		goto unlock;
+
+	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+	hop = hop_masks - k.masks;
+
+	ret = hop ?
+		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+		cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ *                         @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+	struct cpumask ***masks;
+
+	if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+		return ERR_PTR(-EINVAL);
+
+	masks = rcu_dereference(sched_domains_numa_masks);
+	if (!masks)
+		return ERR_PTR(-EBUSY);
+
+	return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
 #endif /* CONFIG_NUMA */
 
 static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 #endif
 
 /**
- * cpumask_local_spread - select the i'th cpu with local numa cpu's first
+ * cpumask_local_spread - select the i'th cpu based on NUMA distances
  * @i: index number
  * @node: local numa_node
  *
- * This function selects an online CPU according to a numa aware policy;
- * local cpus are returned first, followed by non-local ones, then it
- * wraps around.
+ * Returns online CPU according to a numa aware policy; local cpus are returned
+ * first, followed by non-local ones, then it wraps around.
  *
- * It's not very efficient, but useful for setup.
+ * For those who wants to enumerate all CPUs based on their NUMA distances,
+ * i.e. call this function in a loop, like:
+ *
+ * for (i = 0; i < num_online_cpus(); i++) {
+ *	cpu = cpumask_local_spread(i, node);
+ *	do_something(cpu);
+ * }
+ *
+ * There's a better alternative based on for_each()-like iterators:
+ *
+ *	for_each_numa_hop_mask(mask, node) {
+ *		for_each_cpu_andnot(cpu, mask, prev)
+ *			do_something(cpu);
+ *		prev = mask;
+ *	}
+ *
+ * It's simpler and more verbose than above. Complexity of iterator-based
+ * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while
+ * cpumask_local_spread() when called for each cpu is
+ * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)).
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
@@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
 	/* Wrap: we always want a cpu. */
 	i %= num_online_cpus();
 
-	if (node == NUMA_NO_NODE) {
-		cpu = cpumask_nth(i, cpu_online_mask);
-		if (cpu < nr_cpu_ids)
-			return cpu;
-	} else {
-		/* NUMA first. */
-		cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
-		if (cpu < nr_cpu_ids)
-			return cpu;
+	cpu = (node == NUMA_NO_NODE) ?
+		cpumask_nth(i, cpu_online_mask) :
+		sched_numa_find_nth_cpu(cpu_online_mask, i, node);
 
-		i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
-
-		/* Skip NUMA nodes, done above. */
-		cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
-		if (cpu < nr_cpu_ids)
-			return cpu;
-	}
-	BUG();
+	WARN_ON(cpu >= nr_cpu_ids);
+	return cpu;
 }
 EXPORT_SYMBOL(cpumask_local_spread);
 
@@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l
 }
 EXPORT_SYMBOL(__find_nth_andnot_bit);
 
+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
+					const unsigned long *addr2,
+					const unsigned long *addr3,
+					unsigned long size, unsigned long n)
+{
+	return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_andnot_bit);
+
 #ifndef find_next_and_bit
 unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
 				 unsigned long nbits, unsigned long start)