- Correct the marking of kthreads which are supposed to run on a specific,

single CPU vs such which are affine to only one CPU, mark per-cpu workqueue
  threads as such and make sure that marking "survives" CPU hotplug. Fix CPU
  hotplug issues with such kthreads.
 
  - A fix to not push away tasks on CPUs coming online.
 
  - Have workqueue CPU hotplug code use cpu_possible_mask when breaking affinity
    on CPU offlining so that pending workers can finish on newly arrived onlined
    CPUs too.
 
  - Dump tasks which haven't vacated a CPU which is currently being unplugged.
 
  - Register a special scale invariance callback which gets called on resume
  from RAM to read out APERF/MPERF after resume and thus make the schedutil
  scaling governor more precise.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmANYCAACgkQEsHwGGHe
 VUo+OBAAjfqkijDlXiGX6lrT5gRx5NZICpeMgbWa7J13XHT1ysD/b0fMGFIUyF6k
 aszDLTl8U/S1/qGAYlzTSPAFcdZ+ENiFqQ48ozMk4jZC3p0quHTjs/PdiSG6kYBi
 +e4smht+bSyLKxsG8hN0kJ+mLEd+uIQ13kP4YkxPgWbJ9WNP/U6HHGBo0rBchtSe
 Kn6bdd8CfwmC6rSazp7kdQoFoWeQaoMI1ODX3VphK1GtL1wq8WSICzRhpg3caeyG
 3lCIddoNW9mCA9Nkc6R6HeV3uW9JGkPAjnmtTIEHDbg9pib7xNT978ieTQuqNDCi
 DlAHDGumzoaiVJZhD/1fj/RXMJr2YUHxtrXWNsXpiKJ9g8Tn+WC0UW/4+Mx2L/km
 0RSoXJlMs1fGopS2I/fObZ6RPhmg4D+gJsMCdaHQzX4NgxZAGhNNPxMckZ0IM8A0
 2NNXSHUZHVTHeJEW0E/glOcpWb5hG+vDwiBMNEWfTwYpTfrw2EEOZaKniZE7WlSL
 4ItM9rkLGl1KToJzAH4A0oUtSy3vtSCo8B1noGlc09Lj+oCIBlr81z9+C79a2oxG
 qE7Xd4X7y7Qs3JeCbRZWQa7/2Kf1v4XnjELrJJeCZC85r0ZqJDwRX8w7lkmW2XPU
 m4J2prr/DDZSqrRh23/xC1fsU+vcBKSfKUFKAH4Lg2VIaUfSUEk=
 =2DAF
 -----END PGP SIGNATURE-----

Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Correct the marking of kthreads which are supposed to run on a
   specific, single CPU vs such which are affine to only one CPU, mark
   per-cpu workqueue threads as such and make sure that marking
   "survives" CPU hotplug. Fix CPU hotplug issues with such kthreads.

 - A fix to not push away tasks on CPUs coming online.

 - Have workqueue CPU hotplug code use cpu_possible_mask when breaking
   affinity on CPU offlining so that pending workers can finish on newly
   arrived onlined CPUs too.

 - Dump tasks which haven't vacated a CPU which is currently being
   unplugged.

 - Register a special scale invariance callback which gets called on
   resume from RAM to read out APERF/MPERF after resume and thus make
   the schedutil scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Relax the set_cpus_allowed_ptr() semantics
  sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
  sched: Prepare to use balance_push in ttwu()
  workqueue: Restrict affinity change to rescuer
  workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
  kthread: Extract KTHREAD_IS_PER_CPU
  sched: Don't run cpu-online with balance_push() enabled
  workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
  sched/core: Print out straggler tasks in sched_cpu_dying()
  x86: PM: Register syscore_ops for scale invariance
This commit is contained in:
Linus Torvalds 2021-01-24 10:09:20 -08:00
commit 24c56ee06c
7 changed files with 151 additions and 33 deletions

View File

@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
.resume = init_counter_refs,
};
static void register_freq_invariance_syscore_ops(void)
{
/* Bail out if registered already. */
if (freq_invariance_syscore_ops.node.prev)
return;
register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");

View File

@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
unsigned int cpu,
const char *namefmt);
void kthread_set_per_cpu(struct task_struct *k, int cpu);
bool kthread_is_per_cpu(struct task_struct *k);
/**
* kthread_run - create and wake a thread.
* @threadfn: the function to run until signal_pending(current).

View File

@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
kthread_bind(p, cpu);
/* CPU hotplug need to bind once again when unparking the thread. */
set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
to_kthread(p)->cpu = cpu;
return p;
}
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
struct kthread *kthread = to_kthread(k);
if (!kthread)
return;
WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
if (cpu < 0) {
clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
return;
}
kthread->cpu = cpu;
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
bool kthread_is_per_cpu(struct task_struct *k)
{
struct kthread *kthread = to_kthread(k);
if (!kthread)
return false;
return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
}
/**
* kthread_unpark - unpark a thread created by kthread_create().
* @k: thread created by kthread_create().

View File

@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
*/
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
/* When not in the task's cpumask, no point in looking further. */
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
if (is_per_cpu_kthread(p) || is_migration_disabled(p))
/* migrate_disabled() must be allowed to finish. */
if (is_migration_disabled(p))
return cpu_online(cpu);
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
return cpu_active(cpu);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
return cpu_online(cpu);
/* Regular kernel threads don't get to stay during offline. */
if (cpu_rq(cpu)->balance_push)
return false;
/* But are allowed during online. */
return cpu_online(cpu);
}
/*
@ -2327,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs.
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
* away if not KTHREAD_IS_PER_CPU.
*
* Specifically, migration_disabled() tasks must not fail the
* cpumask_any_and_distribute() pick below, esp. so on
@ -2371,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
__do_set_cpus_allowed(p, new_mask, flags);
if (p->flags & PF_KTHREAD) {
/*
* For kernel threads that do indeed end up on online &&
* !active we want to ensure they are strict per-CPU threads.
*/
WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
!cpumask_intersects(new_mask, cpu_active_mask) &&
p->nr_cpus_allowed != 1);
}
return affine_move_task(rq, p, &rf, dest_cpu, flags);
out:
@ -3121,6 +3128,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(int cpu, int wake_flags)
{
/*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
*/
if (!cpu_active(cpu))
return false;
/*
* If the CPU does not share cache, then queue the task on the
* remote rqs wakelist to avoid accessing remote data.
@ -7276,8 +7290,14 @@ static void balance_push(struct rq *rq)
/*
* Both the cpu-hotplug and stop task are in this case and are
* required to complete the hotplug process.
*
* XXX: the idle task does not match kthread_is_per_cpu() due to
* histerical raisins.
*/
if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
if (rq->idle == push_task ||
((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
is_migration_disabled(push_task)) {
/*
* If this is the idle task on the outgoing CPU try to wake
* up the hotplug control thread which might wait for the
@ -7309,7 +7329,7 @@ static void balance_push(struct rq *rq)
/*
* At this point need_resched() is true and we'll take the loop in
* schedule(). The next pick is obviously going to be the stop task
* which is_per_cpu_kthread() and will push this task away.
* which kthread_is_per_cpu() and will push this task away.
*/
raw_spin_lock(&rq->lock);
}
@ -7320,10 +7340,13 @@ static void balance_push_set(int cpu, bool on)
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
if (on)
rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
rq->balance_callback = &balance_push_callback;
else
} else if (rq->balance_callback == &balance_push_callback) {
rq->balance_callback = NULL;
}
rq_unlock_irqrestore(rq, &rf);
}
@ -7441,6 +7464,10 @@ int sched_cpu_activate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
/*
* Make sure that when the hotplug state machine does a roll-back
* we clear balance_push. Ideally that would happen earlier...
*/
balance_push_set(cpu, false);
#ifdef CONFIG_SCHED_SMT
@ -7483,17 +7510,27 @@ int sched_cpu_deactivate(unsigned int cpu)
int ret;
set_cpu_active(cpu, false);
/*
* We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
* users of this state to go away such that all new such users will
* observe it.
* From this point forward, this CPU will refuse to run any task that
* is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
* push those tasks away until this gets cleared, see
* sched_cpu_dying().
*/
balance_push_set(cpu, true);
/*
* We've cleared cpu_active_mask / set balance_push, wait for all
* preempt-disabled and RCU users of this state to go away such that
* all new such users will observe it.
*
* Specifically, we rely on ttwu to no longer target this CPU, see
* ttwu_queue_cond() and is_cpu_allowed().
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
synchronize_rcu();
balance_push_set(cpu, true);
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
update_rq_clock(rq);
@ -7574,6 +7611,25 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
static void dump_rq_tasks(struct rq *rq, const char *loglvl)
{
struct task_struct *g, *p;
int cpu = cpu_of(rq);
lockdep_assert_held(&rq->lock);
printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
for_each_process_thread(g, p) {
if (task_cpu(p) != cpu)
continue;
if (!task_on_rq_queued(p))
continue;
printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
}
}
int sched_cpu_dying(unsigned int cpu)
{
struct rq *rq = cpu_rq(cpu);
@ -7583,9 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
sched_tick_stop(cpu);
rq_lock_irqsave(rq, &rf);
BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
WARN(true, "Dying CPU not properly vacated!");
dump_rq_tasks(rq, KERN_WARNING);
}
rq_unlock_irqrestore(rq, &rf);
/*
* Now that the CPU is offline, make sure we're welcome
* to new tasks once we come back up.
*/
balance_push_set(cpu, false);
calc_load_migrate(rq);
update_max_interval();
nohz_balance_exit_idle(rq);

View File

@ -975,6 +975,7 @@ struct rq {
unsigned long cpu_capacity_orig;
struct callback_head *balance_callback;
unsigned char balance_push;
unsigned char nohz_idle_balance;
unsigned char idle_balance;

View File

@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kfree(td);
return PTR_ERR(tsk);
}
kthread_set_per_cpu(tsk, cpu);
/*
* Park the thread so that it could start right on the CPU
* when it is available.

View File

@ -1848,12 +1848,6 @@ static void worker_attach_to_pool(struct worker *worker,
{
mutex_lock(&wq_pool_attach_mutex);
/*
* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
* online CPUs. It'll be re-applied when any of the CPUs come up.
*/
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
/*
* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
* stable across this function. See the comments above the flag
@ -1861,6 +1855,11 @@ static void worker_attach_to_pool(struct worker *worker,
*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
else
kthread_set_per_cpu(worker->task, pool->cpu);
if (worker->rescue_wq)
set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
list_add_tail(&worker->node, &pool->workers);
worker->pool = pool;
@ -1883,6 +1882,7 @@ static void worker_detach_from_pool(struct worker *worker)
mutex_lock(&wq_pool_attach_mutex);
kthread_set_per_cpu(worker->task, -1);
list_del(&worker->node);
worker->pool = NULL;
@ -4919,8 +4919,10 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);
@ -4972,9 +4974,11 @@ static void rebind_workers(struct worker_pool *pool)
* of all workers first and then clear UNBOUND. As we're called
* from CPU_ONLINE, the following shouldn't fail.
*/
for_each_pool_worker(worker, pool)
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, pool->cpu);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
pool->attrs->cpumask) < 0);
}
raw_spin_lock_irq(&pool->lock);