sched/topology: Improve load balancing on AMD EPYC systems
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init()
for any sched domains with a NUMA distance greater than 2 hops
(RECLAIM_DISTANCE). The idea being that it's expensive to balance
across domains that far apart.

However, as is rather unfortunately explained in:

  commit 32e45ff43eaf ("mm: increase RECLAIM_DISTANCE to 30")

the value for RECLAIM_DISTANCE is based on node distance tables from
2011-era hardware.

Current AMD EPYC machines have the following NUMA node distances:

  node distances:
  node   0   1   2   3   4   5   6   7
    0:  10  16  16  16  32  32  32  32
    1:  16  10  16  16  32  32  32  32
    2:  16  16  10  16  32  32  32  32
    3:  16  16  16  10  32  32  32  32
    4:  32  32  32  32  10  16  16  16
    5:  32  32  32  32  16  10  16  16
    6:  32  32  32  32  16  16  10  16
    7:  32  32  32  32  16  16  16  10

where 2 hops is 32.

The result is that the scheduler fails to load balance properly across
NUMA nodes on different sockets -- 2 hops apart.

For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,

  $ numactl -C 0-7,32-39 ./spinner 16

causes all threads to fork and remain on node 0 until the active
balancer kicks in after a few seconds and forcibly moves some threads
to node 4.

Override node_reclaim_distance for AMD Zen.

Signed-off-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@surriel.com>
Cc: Suravee.Suthikulpanit@amd.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas.Lendacky@amd.com
Cc: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20190808195301.13222-3-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
parent
a2cbfd4655
commit
a55c7454a8
@ -8,6 +8,7 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/clock.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/topology.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/apic.h>
|
||||
#include <asm/cacheinfo.h>
|
||||
@ -824,6 +825,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
|
||||
{
|
||||
set_cpu_cap(c, X86_FEATURE_ZEN);
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
node_reclaim_distance = 32;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Fix erratum 1076: CPB feature bit not being set in CPUID.
|
||||
* Always set it, except when running under a hypervisor.
|
||||
|
@ -59,6 +59,20 @@ int arch_update_cpu_topology(void);
|
||||
*/
|
||||
#define RECLAIM_DISTANCE 30
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The following tunable allows platforms to override the default node
|
||||
* reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
|
||||
* sufficiently fast that the default value actually hurts
|
||||
* performance.
|
||||
*
|
||||
* AMD EPYC machines use this because even though the 2-hop distance
|
||||
* is 32 (3.2x slower than a local memory access) performance actually
|
||||
* *improves* if allowed to reclaim memory and load balance tasks
|
||||
* between NUMA nodes 2-hops apart.
|
||||
*/
|
||||
extern int __read_mostly node_reclaim_distance;
|
||||
|
||||
#ifndef PENALTY_FOR_NODE_WITH_CPUS
|
||||
#define PENALTY_FOR_NODE_WITH_CPUS (1)
|
||||
#endif
|
||||
|
@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
|
||||
int sched_max_numa_distance;
|
||||
static int *sched_domains_numa_distance;
|
||||
static struct cpumask ***sched_domains_numa_masks;
|
||||
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
|
||||
#endif
|
||||
|
||||
/*
|
||||
@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
||||
|
||||
sd->flags &= ~SD_PREFER_SIBLING;
|
||||
sd->flags |= SD_SERIALIZE;
|
||||
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
|
||||
if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
|
||||
sd->flags &= ~(SD_BALANCE_EXEC |
|
||||
SD_BALANCE_FORK |
|
||||
SD_WAKE_AFFINE);
|
||||
|
@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
|
||||
for (i = 0; i < MAX_NUMNODES; i++) {
|
||||
if (!khugepaged_node_load[i])
|
||||
continue;
|
||||
if (node_distance(nid, i) > RECLAIM_DISTANCE)
|
||||
if (node_distance(nid, i) > node_reclaim_distance)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -3522,7 +3522,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
|
||||
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
|
||||
{
|
||||
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
|
||||
RECLAIM_DISTANCE;
|
||||
node_reclaim_distance;
|
||||
}
|
||||
#else /* CONFIG_NUMA */
|
||||
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
|
||||
|
Loading…
x
Reference in New Issue
Block a user