BUG/MAJOR: sched: protect task during removal from wait queue

The issue addressed by commit fbb934da9 ("BUG/MEDIUM: stick-table: fix
a race condition when updating the expiration task") is still present
when thread groups are enabled, but this time it lies in the scheduler.

What happens is that a task configured to run anywhere might already
have been queued into one group's wait queue. When updating a stick
table entry, sometimes the task will have to be dequeued and requeued.

For this the current thread group's wait queue lock is taken; while this
is necessary for the queuing, it is not sufficient for the dequeuing,
since another thread might be in the process of expiring this task under
its own group's lock, which is a different one. This is easy to reproduce
with 3 stick tables using a 1ms expiration, 3 track-sc rules and 4 thread
groups: the process crashes almost instantly under heavy traffic.
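
To make the failure mode concrete, here is a minimal standalone
illustration in plain C and pthreads (made-up names, not HAProxy code):
two threads guard the same shared state with two different mutexes, so
neither excludes the other, exactly like two thread groups' wait queue
locks failing to exclude each other on a shared task.

/* Standalone illustration only (plain C + pthreads, not HAProxy code).
 * Each thread "protects" the shared counters with its own mutex, so the
 * two critical sections do not exclude each other and the invariant
 * (a == b) can be observed broken, just like a task being dequeued under
 * one group's wq_lock while being expired under another group's lock.
 * Build with: cc -O2 -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>

static struct { long a, b; } shared;   /* intended invariant: a == b */
static pthread_mutex_t lock_grp1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_grp2 = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	pthread_mutex_t *my_lock = arg;   /* each thread uses "its" lock */

	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(my_lock);
		shared.a++;               /* not atomic vs the other thread */
		shared.b++;
		if (shared.a != shared.b)
			fprintf(stderr, "invariant broken: %ld != %ld\n",
			        shared.a, shared.b);
		pthread_mutex_unlock(my_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, &lock_grp1);
	pthread_create(&t2, NULL, worker, &lock_grp2);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("final: a=%ld b=%ld\n", shared.a, shared.b);
	return 0;
}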

One approach could consist in storing in the task's descriptor the group
number it was queued under (we don't need 32 bits to store the thread id,
so one short could hold the tid and another one the tgrp). Sadly, no safe
way to do this could be found, because the race remains at the moment the
thread group number is checked, as it might be in the process of being
changed by another thread. It seems that a working approach could consist
in always keeping the task associated with one group, and only allowing
that association to change under this group's lock, so that any code
trying to change it would have to iteratively read the group number and
lock that group until the value matches, confirming it really holds the
correct lock. But this seems a bit complicated, particularly with
wake_expired_tasks() which already uses upgradable locks to switch from
a read state to a write state.
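
For the record, a rough sketch of that rejected scheme follows, with
hypothetical names and plain pthreads instead of the HA_RWLOCK API (none
of this is part of the patch): the dequeuing side would have to loop,
re-reading the task's group id after taking that group's lock, until
both agree.

/* Hypothetical sketch of the rejected approach, NOT part of this patch.
 * The task records which group's wait queue currently holds it, and a
 * thread wanting to dequeue it loops until the group id read before
 * taking the lock still matches after taking it, which is the only way
 * to be sure the lock it holds really covers the task.
 */
#include <pthread.h>
#include <stdatomic.h>

#define MAX_TGROUPS 16

struct fake_tgroup {
	pthread_rwlock_t wq_lock;        /* stand-in for a per-group wq_lock,
	                                  * assumed initialized at startup */
};

struct fake_task {
	_Atomic unsigned short wq_tgid;  /* group whose wait queue holds the task */
};

static struct fake_tgroup tgroups[MAX_TGROUPS];

/* Take the write lock of the group that really owns <t>'s wait queue
 * entry and return that group's id, retrying as long as the task keeps
 * migrating to another group's queue.
 */
unsigned short lock_owning_group(struct fake_task *t)
{
	unsigned short tgid;

	while (1) {
		tgid = atomic_load(&t->wq_tgid);
		pthread_rwlock_wrlock(&tgroups[tgid].wq_lock);
		if (atomic_load(&t->wq_tgid) == tgid)
			return tgid;     /* confirmed: we hold the right lock */
		/* the task moved to another group's queue meanwhile; retry */
		pthread_rwlock_unlock(&tgroups[tgid].wq_lock);
	}
}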

Given that the shared tasks are not that common (stick-table expirations,
rate-limited listeners, maybe resolvers), it doesn't seem worth the extra
complexity for now. This patch takes a simpler and safer approach
consisting in switching back to a single wq_lock, but still keeping
separate wait queues. Given that shared wait queues are almost always
empty and that otherwise they're scanned under a read lock, the
contention remains manageable and most of the time the lock doesn't
even need to be taken since such tasks are not present in a group's
queue. In essence, this patch reverts half of the aforementioned patch.
This was tested and confirmed to work fine, without observing any
performance degradation under any workload. The performance with 8 groups
on an EPYC 74F3 and 3 tables remains twice that of a single group, with
the table's lock remaining the primary point of contention.
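
For a condensed view of the retained scheme, the sketch below uses
made-up names and plain pthreads (the real code below relies on HAProxy's
upgradable HA_RWLOCK seek lock rather than the plain write lock shown
here): a single process-wide rwlock covers every group's shared wait
queue, the emptiness check stays lockless, scanning only takes the read
side, and any modification takes the write side regardless of which
group's queue currently holds the entry.

/* Condensed sketch of the retained locking scheme (illustrative only).
 * One process-wide rwlock covers all per-group shared wait queues;
 * since those queues are almost always empty, the common case never
 * takes the lock at all.
 */
#include <pthread.h>

struct wq_entry {
	struct wq_entry *next;
	int expire;
};

struct group_wq {
	struct wq_entry *head;            /* stand-in for a group's timers tree */
};

/* the one lock shared by all groups' wait queues */
static pthread_rwlock_t wq_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Return the earliest expiration queued in <wq>, or -1 if none. */
int next_expiry(struct group_wq *wq)
{
	int ret = -1;

	if (!wq->head)                    /* lockless empty check: the common case */
		return ret;

	pthread_rwlock_rdlock(&wq_lock);  /* scanning only needs the read side */
	for (struct wq_entry *e = wq->head; e; e = e->next)
		if (ret < 0 || e->expire < ret)
			ret = e->expire;
	pthread_rwlock_unlock(&wq_lock);
	return ret;
}

/* Unlinking an entry takes the write side of the same lock no matter
 * which group's queue holds the entry, so a concurrent expiration on
 * another group can no longer race with the removal.
 */
void unlink_entry(struct group_wq *wq, struct wq_entry *ent)
{
	pthread_rwlock_wrlock(&wq_lock);
	for (struct wq_entry **p = &wq->head; *p; p = &(*p)->next) {
		if (*p == ent) {
			*p = ent->next;
			break;
		}
	}
	pthread_rwlock_unlock(&wq_lock);
}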

No backport is needed.
Author: Willy Tarreau
Date:   2022-11-22 07:05:44 +01:00
Parent: 469fa47950
Commit: fc50b9dd14

3 changed files with 26 additions and 19 deletions

include/haproxy/task.h

@@ -91,6 +91,8 @@ extern struct pool_head *pool_head_task;
 extern struct pool_head *pool_head_tasklet;
 extern struct pool_head *pool_head_notification;
 
+__decl_thread(extern HA_RWLOCK_T wq_lock THREAD_ALIGNED(64));
+
 void __tasklet_wakeup_on(struct tasklet *tl, int thr);
 struct list *__tasklet_wakeup_after(struct list *head, struct tasklet *tl);
 void task_kill(struct task *t);
@@ -264,10 +266,10 @@ static inline struct task *task_unlink_wq(struct task *t)
 		locked = t->tid < 0;
 		BUG_ON(t->tid >= 0 && t->tid != tid && !(global.mode & MODE_STOPPING));
 		if (locked)
-			HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		__task_unlink_wq(t);
 		if (locked)
-			HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	}
 	return t;
 }
@@ -294,10 +296,10 @@ static inline void task_queue(struct task *task)
 #ifdef USE_THREAD
 	if (task->tid < 0) {
-		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
 			__task_queue(task, &tg_ctx->timers);
-		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	} else
 #endif
 	{
@@ -654,14 +656,14 @@ static inline void task_schedule(struct task *task, int when)
 #ifdef USE_THREAD
 	if (task->tid < 0) {
 		/* FIXME: is it really needed to lock the WQ during the check ? */
-		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (task_in_wq(task))
 			when = tick_first(when, task->expire);
 		task->expire = when;
 		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
 			__task_queue(task, &tg_ctx->timers);
-		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	} else
 #endif
 	{

include/haproxy/tinfo-t.h

@@ -70,7 +70,6 @@ struct tgroup_ctx {
 	ulong threads_idle; /* mask of threads idling in the poller */
 	ulong stopping_threads; /* mask of threads currently stopping */
 
-	HA_RWLOCK_T wq_lock; /* RW lock related to the wait queue below */
 	struct eb_root timers; /* wait queue (sorted timers tree, global, accessed under wq_lock) */
 
 	uint niced_tasks; /* number of niced tasks in this group's run queues */

src/task.c

@@ -35,6 +35,12 @@ DECLARE_POOL(pool_head_tasklet, "tasklet", sizeof(struct tasklet));
  */
 DECLARE_POOL(pool_head_notification, "notification", sizeof(struct notification));
 
+/* The lock protecting all wait queues at once. For now we have no better
+ * alternative since a task may have to be removed from a queue and placed
+ * into another one. Storing the WQ index into the task doesn't seem to be
+ * sufficient either.
+ */
+__decl_thread(HA_RWLOCK_T wq_lock THREAD_ALIGNED(64) = 0);
 
 /* Flags the task <t> for immediate destruction and puts it into its first
  * thread's shared tasklet list if not yet queued/running. This will bypass
@@ -362,29 +368,29 @@ void wake_expired_tasks()
 	if (eb_is_empty(&tg_ctx->timers))
 		goto leave;
-	HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+	HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock);
 	eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK);
 	if (!eb) {
 		eb = eb32_first(&tg_ctx->timers);
 		if (likely(!eb)) {
-			HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 			goto leave;
 		}
 	}
 	key = eb->key;
 	if (tick_is_lt(now_ms, key)) {
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		goto leave;
 	}
 	/* There's really something of interest here, let's visit the queue */
-	if (HA_RWLOCK_TRYRDTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock)) {
+	if (HA_RWLOCK_TRYRDTOSK(TASK_WQ_LOCK, &wq_lock)) {
 		/* if we failed to grab the lock it means another thread is
 		 * already doing the same here, so let it do the job.
 		 */
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		goto leave;
 	}
@@ -423,20 +429,20 @@ void wake_expired_tasks()
 		if (tick_is_expired(task->expire, now_ms)) {
 			/* expired task, wake it up */
-			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock);
 			__task_unlink_wq(task);
-			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock);
 			task_drop_running(task, TASK_WOKEN_TIMER);
 		}
 		else if (task->expire != eb->key) {
 			/* task is not expired but its key doesn't match so let's
 			 * update it and skip to next apparently expired task.
 			 */
-			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock);
 			__task_unlink_wq(task);
 			if (tick_isset(task->expire))
 				__task_queue(task, &tg_ctx->timers);
-			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock);
 			task_drop_running(task, 0);
 			goto lookup_next;
 		}
@@ -448,7 +454,7 @@ void wake_expired_tasks()
 		}
 	}
 
-	HA_RWLOCK_SKUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+	HA_RWLOCK_SKUNLOCK(TASK_WQ_LOCK, &wq_lock);
 #endif
 leave:
 	return;
@@ -480,13 +486,13 @@ int next_timer_expiry()
 #ifdef USE_THREAD
 	if (!eb_is_empty(&tg_ctx->timers)) {
-		HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock);
 		eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK);
 		if (!eb)
 			eb = eb32_first(&tg_ctx->timers);
 		if (eb)
 			key = eb->key;
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (eb)
 			ret = tick_first(ret, key);
 	}