sched/fair: Rewrite group_imb trigger
Change the group_imb detection from the old 'load-spike' detector to
an actual imbalance detector. We set it from the lower domain balance
pass when it fails to create a balance in the presence of task
affinities.

The advantage is that this should no longer generate the false
positive group_imb conditions generated by transient load spikes from
the normal balancing/bulk-wakeup etc. behaviour. While I haven't
actually observed those they could happen.

I'm not entirely happy with this patch; it somehow feels a little
fragile.

Nor does it solve the biggest issue I have with the group_imb code; it
is still a fragile construct in that once we 'fixed' the imbalance
we'll not detect the group_imb again and could end up re-creating it.

That said, this patch does seem to preserve behaviour for the
described degenerate case. In particular on my 2*6*2 wsm-ep:

  taskset -c 3-11 bash -c 'for ((i=0;i<9;i++)) do while :; do :; done & done'

ends up with 9 spinners, each on their own CPU; whereas if you disable
the group_imb code that typically doesn't happen (you'll get one pair
sharing a CPU most of the time).

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-36fpbgl39dv4u51b6yz2ypz5@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit 6263322c5e
parent 3b524d6094
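As a reading aid, not part of the patch itself: the new trigger path, condensed from the hunks below. All identifiers are taken from the diff, but this is a sketch rather than a buildable excerpt of kernel/sched/fair.c.

    /* 1) can_migrate_task(): dst_cpu not allowed, so remember affinity got in the way. */
    if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
            env->flags |= LBF_SOME_PINNED;
            /* ... LBF_DST_PINNED / new_dst_cpu retry handling as before ... */
            return 0;
    }

    /* 2) load_balance(): affinity left an imbalance, so tell the parent domain. */
    if (sd_parent) {
            int *group_imbalance = &sd_parent->groups->sgp->imbalance;

            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                    *group_imbalance = 1;
            else if (*group_imbalance)
                    *group_imbalance = 0;
    }

    /* 3) Parent-level statistics pass: group_imb is now just that stored bit. */
    static inline int sg_imbalanced(struct sched_group *group)
    {
            return group->sgp->imbalance;
    }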
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3906,7 +3906,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED  0x01
 #define LBF_NEED_BREAK  0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED  0x04
+#define LBF_SOME_PINNED 0x08
 
 struct lb_env {
         struct sched_domain *sd;
@@ -3997,6 +3998,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
+                env->flags |= LBF_SOME_PINNED;
+
                 /*
                  * Remember if this task can be migrated to any other cpu in
                  * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4008,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                  * Also avoid computing new_dst_cpu if we have already computed
                  * one in current iteration.
                  */
-                if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+                if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
                         return 0;
 
                 /* Prevent to re-select dst_cpu via env's cpus */
                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
-                                env->flags |= LBF_SOME_PINNED;
+                                env->flags |= LBF_DST_PINNED;
                                 env->new_dst_cpu = cpu;
                                 break;
                         }
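For orientation, my reading of what the three pin flags mean after this split; this summary comment is mine, not taken from the tree:

    /*
     * LBF_ALL_PINNED  - every candidate task on the busiest rq was pinned
     *                   away from dst_cpu; the whole rq is useless for this
     *                   balance attempt (see the LBF_ALL_PINNED handling in
     *                   load_balance()).
     * LBF_DST_PINNED  - some task cannot run on dst_cpu but does fit another
     *                   cpu of the destination group; retry via new_dst_cpu.
     * LBF_SOME_PINNED - affinity prevented at least one migration; used in
     *                   load_balance() below to flag the parent group as
     *                   imbalanced.
     */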
@@ -4526,13 +4529,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * cpu 3 and leave one of the cpus in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
  *
  * When this is so detected; this group becomes a candidate for busiest; see
  * update_sd_pick_busiest(). And calculcate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
  * to create an effective group imbalance.
  *
  * This is a somewhat tricky proposition since the next run might not find the
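To make the degenerate case referenced in this comment concrete, here is an illustrative layout of my own (not spelled out in the hunk): two groups of four cpus and four tasks whose cpumask covers cpu 3 plus three cpus of the second group:

    groups:  { 0 1 2 3 }  { 4 5 6 7 }
    allowed:         *        * * *

Balancing group-wise wants two tasks per group, which piles two tasks onto cpu 3 and leaves one of the allowed cpus in the second group unused; the group_imb path is what lets the second group pull a third task anyway.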
@@ -4540,49 +4542,9 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * subtle and fragile situation.
  */
 
-struct sg_imb_stats {
-        unsigned long max_nr_running, min_nr_running;
-        unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
 {
-        sgi->max_cpu_load = sgi->max_nr_running = 0UL;
-        sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
-}
-
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
-                    unsigned long load, unsigned long nr_running)
-{
-        if (load > sgi->max_cpu_load)
-                sgi->max_cpu_load = load;
-        if (sgi->min_cpu_load > load)
-                sgi->min_cpu_load = load;
-
-        if (nr_running > sgi->max_nr_running)
-                sgi->max_nr_running = nr_running;
-        if (sgi->min_nr_running > nr_running)
-                sgi->min_nr_running = nr_running;
-}
-
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
-{
-        /*
-         * Consider the group unbalanced when the imbalance is larger
-         * than the average weight of a task.
-         *
-         * APZ: with cgroup the avg task weight can vary wildly and
-         * might not be a suitable number - should we keep a
-         * normalized nr_running number somewhere that negates
-         * the hierarchy?
-         */
-        if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
-            (sgi->max_nr_running - sgi->min_nr_running) > 1)
-                return 1;
-
-        return 0;
+        return group->sgp->imbalance;
 }
 
 /**
@@ -4597,25 +4559,20 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                         struct sched_group *group, int load_idx,
                         int local_group, struct sg_lb_stats *sgs)
 {
-        struct sg_imb_stats sgi;
         unsigned long nr_running;
         unsigned long load;
         int i;
 
-        init_sg_imb_stats(&sgi);
-
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                 struct rq *rq = cpu_rq(i);
 
                 nr_running = rq->nr_running;
 
                 /* Bias balancing toward cpus of our domain */
-                if (local_group) {
+                if (local_group)
                         load = target_load(i, load_idx);
-                } else {
+                else
                         load = source_load(i, load_idx);
-                        update_sg_imb_stats(&sgi, load, nr_running);
-                }
 
                 sgs->group_load += load;
                 sgs->sum_nr_running += nr_running;
@@ -4635,7 +4592,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
         if (sgs->sum_nr_running)
                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-        sgs->group_imb = sg_imbalanced(sgs, &sgi);
+        sgs->group_imb = sg_imbalanced(group);
 
         sgs->group_capacity =
                 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
@@ -5163,6 +5120,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                         int *continue_balancing)
 {
         int ld_moved, cur_ld_moved, active_balance = 0;
+        struct sched_domain *sd_parent = sd->parent;
         struct sched_group *group;
         struct rq *busiest;
         unsigned long flags;
@@ -5267,11 +5225,11 @@ more_balance:
                  * moreover subsequent load balance cycles should correct the
                  * excess load moved.
                  */
-                if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 
                         env.dst_rq = cpu_rq(env.new_dst_cpu);
                         env.dst_cpu = env.new_dst_cpu;
-                        env.flags &= ~LBF_SOME_PINNED;
+                        env.flags &= ~LBF_DST_PINNED;
                         env.loop = 0;
                         env.loop_break = sched_nr_migrate_break;
 
@@ -5285,6 +5243,18 @@ more_balance:
                         goto more_balance;
                 }
 
+                /*
+                 * We failed to reach balance because of affinity.
+                 */
+                if (sd_parent) {
+                        int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+                        if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+                                *group_imbalance = 1;
+                        } else if (*group_imbalance)
+                                *group_imbalance = 0;
+                }
+
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
                         cpumask_clear_cpu(cpu_of(busiest), cpus);
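To spell out the intended flag lifetime, and the fragility the changelog mentions, an illustrative walk-through; the domain levels named here are only an example:

    /*
     * 1) An MC-level load_balance() cannot move enough load because of task
     *    affinities: LBF_SOME_PINNED is set and env.imbalance stays > 0, so
     *    the hunk above sets sd_parent->groups->sgp->imbalance = 1.
     *
     * 2) The next DIE-level pass reads that bit via sg_imbalanced(), marks
     *    the group with sgs->group_imb and lets find_busiest_group() pull
     *    across groups where the usual avg_load checks would refuse.
     *
     * 3) Once a later MC-level pass balances cleanly the bit is cleared
     *    again; nothing then stops the imbalance from being re-created,
     *    which is the "we'll not detect the group_imb again" caveat from
     *    the changelog.
     */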
@@ -5688,7 +5658,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                                 /*
-                                 * The LBF_SOME_PINNED logic could have changed
+                                 * The LBF_DST_PINNED logic could have changed
                                  * env->dst_cpu, so we can't know our idle
                                  * state even if we migrated tasks. Update it.
                                  */
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -605,6 +605,7 @@ struct sched_group_power {
          */
         unsigned int power, power_orig;
         unsigned long next_update;
+        int imbalance; /* XXX unrelated to power but shared group state */
         /*
          * Number of busy cpus in this group.
          */