Merge branch 'for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue updates from Tejun Heo:

 - The code around workqueue scheduler hooks got reorganized early 2019
   which unfortuately introdued a couple subtle and rare race conditions
   where preemption can mangle internal workqueue state triggering a
   WARN and possibly causing a stall or at least delay in execution.

   Frederic fixed both early December and the fixes were sitting in
   for-5.16-fixes which I forgot to push. They are here now. I'll
   forward them to stable after they land.

 - The scheduler hook reorganization has more implicatoins for workqueue
   code in that the hooks are now more strictly synchronized and thus
   the interacting operations can become more straight-forward.

   Lai is in the process of simplifying workqueue code and this pull
   request contains some of the patches.

* 'for-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Remove the cacheline_aligned for nr_running
  workqueue: Move the code of waking a worker up in unbind_workers()
  workqueue: Remove schedule() in unbind_workers()
  workqueue: Remove outdated comment about exceptional workers in unbind_workers()
  workqueue: Remove the advanced kicking of the idle workers in rebind_workers()
  workqueue: Remove the outdated comment before wq_worker_sleeping()
  workqueue: Fix unbind_workers() VS wq_worker_sleeping() race
  workqueue: Fix unbind_workers() VS wq_worker_running() race
  workqueue: Upgrade queue_work_on() comment
This commit is contained in:
Linus Torvalds 2022-01-11 09:19:29 -08:00
commit e9e64f85b4

View File

@ -154,6 +154,9 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */ unsigned long watchdog_ts; /* L: watchdog timestamp */
/* The current concurrency level. */
atomic_t nr_running;
struct list_head worklist; /* L: list of pending works */ struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */ int nr_workers; /* L: total number of workers */
@ -177,19 +180,12 @@ struct worker_pool {
struct hlist_node hash_node; /* PL: unbound_pool_hash node */ struct hlist_node hash_node; /* PL: unbound_pool_hash node */
int refcnt; /* PL: refcnt for unbound pools */ int refcnt; /* PL: refcnt for unbound pools */
/*
* The current concurrency level. As it's likely to be accessed
* from other CPUs during try_to_wake_up(), put it in a separate
* cacheline.
*/
atomic_t nr_running ____cacheline_aligned_in_smp;
/* /*
* Destruction of pool is RCU protected to allow dereferences * Destruction of pool is RCU protected to allow dereferences
* from get_work_pool(). * from get_work_pool().
*/ */
struct rcu_head rcu; struct rcu_head rcu;
} ____cacheline_aligned_in_smp; };
/* /*
* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
@ -868,8 +864,17 @@ void wq_worker_running(struct task_struct *task)
if (!worker->sleeping) if (!worker->sleeping)
return; return;
/*
* If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
* and the nr_running increment below, we may ruin the nr_running reset
* and leave with an unexpected pool->nr_running == 1 on the newly unbound
* pool. Protect against such race.
*/
preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running); atomic_inc(&worker->pool->nr_running);
preempt_enable();
worker->sleeping = 0; worker->sleeping = 0;
} }
@ -878,8 +883,7 @@ void wq_worker_running(struct task_struct *task)
* @task: task going to sleep * @task: task going to sleep
* *
* This function is called from schedule() when a busy worker is * This function is called from schedule() when a busy worker is
* going to sleep. Preemption needs to be disabled to protect ->sleeping * going to sleep.
* assignment.
*/ */
void wq_worker_sleeping(struct task_struct *task) void wq_worker_sleeping(struct task_struct *task)
{ {
@ -903,6 +907,16 @@ void wq_worker_sleeping(struct task_struct *task)
worker->sleeping = 1; worker->sleeping = 1;
raw_spin_lock_irq(&pool->lock); raw_spin_lock_irq(&pool->lock);
/*
* Recheck in case unbind_workers() preempted us. We don't
* want to decrement nr_running after the worker is unbound
* and nr_running has been reset.
*/
if (worker->flags & WORKER_NOT_RUNNING) {
raw_spin_unlock_irq(&pool->lock);
return;
}
/* /*
* The counterpart of the following dec_and_test, implied mb, * The counterpart of the following dec_and_test, implied mb,
* worklist not empty test sequence is in insert_work(). * worklist not empty test sequence is in insert_work().
@ -1531,7 +1545,8 @@ out:
* @work: work to queue * @work: work to queue
* *
* We queue the work to a specific CPU, the caller must ensure it * We queue the work to a specific CPU, the caller must ensure it
* can't go away. * can't go away. Callers that fail to ensure that the specified
* CPU cannot go away will execute on a randomly chosen CPU.
* *
* Return: %false if @work was already on a queue, %true otherwise. * Return: %false if @work was already on a queue, %true otherwise.
*/ */
@ -1811,14 +1826,8 @@ static void worker_enter_idle(struct worker *worker)
if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/* /* Sanity check nr_running. */
* Sanity check nr_running. Because unbind_workers() releases WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
* pool->lock between setting %WORKER_UNBOUND and zapping
* nr_running, the warning may trigger spuriously. Check iff
* unbind is not in progress.
*/
WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
pool->nr_workers == pool->nr_idle &&
atomic_read(&pool->nr_running)); atomic_read(&pool->nr_running));
} }
@ -4979,38 +4988,22 @@ static void unbind_workers(int cpu)
/* /*
* We've blocked all attach/detach operations. Make all workers * We've blocked all attach/detach operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers * unbound and set DISASSOCIATED. Before this, all workers
* except for the ones which are still executing works from * must be on the cpu. After this, they may become diasporas.
* before the last CPU down must be on the cpu. After * And the preemption disabled section in their sched callbacks
* this, they may become diasporas. * are guaranteed to see WORKER_UNBOUND since the code here
* is on the same cpu.
*/ */
for_each_pool_worker(worker, pool) for_each_pool_worker(worker, pool)
worker->flags |= WORKER_UNBOUND; worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED; pool->flags |= POOL_DISASSOCIATED;
raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);
/* /*
* Call schedule() so that we cross rq->lock and thus can * The handling of nr_running in sched callbacks are disabled
* guarantee sched callbacks see the %WORKER_UNBOUND flag. * now. Zap nr_running. After this, nr_running stays zero and
* This is necessary as scheduler callbacks may be invoked * need_more_worker() and keep_working() are always true as
* from other cpus. * long as the worklist is not empty. This pool now behaves as
*/ * an unbound (in terms of concurrency management) pool which
schedule();
/*
* Sched callbacks are disabled now. Zap nr_running.
* After this, nr_running stays zero and need_more_worker()
* and keep_working() are always true as long as the
* worklist is not empty. This pool now behaves as an
* unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool. * are served by workers tied to the pool.
*/ */
atomic_set(&pool->nr_running, 0); atomic_set(&pool->nr_running, 0);
@ -5020,9 +5013,16 @@ static void unbind_workers(int cpu)
* worker blocking could lead to lengthy stalls. Kick off * worker blocking could lead to lengthy stalls. Kick off
* unbound chain execution of currently pending work items. * unbound chain execution of currently pending work items.
*/ */
raw_spin_lock_irq(&pool->lock);
wake_up_worker(pool); wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock); raw_spin_unlock_irq(&pool->lock);
for_each_pool_worker(worker, pool) {
kthread_set_per_cpu(worker->task, -1);
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
}
mutex_unlock(&wq_pool_attach_mutex);
} }
} }
@ -5058,17 +5058,6 @@ static void rebind_workers(struct worker_pool *pool)
for_each_pool_worker(worker, pool) { for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags; unsigned int worker_flags = worker->flags;
/*
* A bound idle worker should actually be on the runqueue
* of the associated CPU for local wake-ups targeting it to
* work. Kick all idle workers so that they migrate to the
* associated CPU. Doing this in the same loop as
* replacing UNBOUND with REBOUND is safe as no worker will
* be bound before @pool->lock is released.
*/
if (worker_flags & WORKER_IDLE)
wake_up_process(worker->task);
/* /*
* We want to clear UNBOUND but can't directly call * We want to clear UNBOUND but can't directly call
* worker_clr_flags() or adjust nr_running. Atomically * worker_clr_flags() or adjust nr_running. Atomically