BUG/MAJOR: sched: protect task during removal from wait queue

The issue addressed by commit fbb934da9 ("BUG/MEDIUM: stick-table: fix
a race condition when updating the expiration task") is still present
when thread groups are enabled, but this time it lies in the scheduler.

What happens is that a task configured to run anywhere might already
have been queued into one group's wait queue. When updating a stick
table entry, sometimes the task will have to be dequeued and requeued.

For this the current thread group's wait queue lock is taken; while this
is necessary for the queuing, it is not sufficient for the dequeuing,
since another thread might be in the process of expiring this task under
its own group's lock, which is a different one. This is easy to reproduce
with 3 stick tables using a 1ms expiration, 3 track-sc rules and 4 thread
groups: the process crashes almost instantly under heavy traffic.
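
To make the failure mode concrete, here is a minimal standalone
illustration in plain C and pthreads (made-up names, not HAProxy code):
two threads guard the same shared state with two different mutexes, so
neither excludes the other, exactly like two thread groups' wait queue
locks failing to exclude each other on a shared task.

/* Standalone illustration only (plain C + pthreads, not HAProxy code).
 * Each thread "protects" the shared counters with its own mutex, so the
 * two critical sections do not exclude each other and the invariant
 * (a == b) can be observed broken, just like a task being dequeued under
 * one group's wq_lock while being expired under another group's lock.
 * Build with: cc -O2 -pthread demo.c
 */
#include <pthread.h>
#include <stdio.h>

static struct { long a, b; } shared;   /* intended invariant: a == b */
static pthread_mutex_t lock_grp1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_grp2 = PTHREAD_MUTEX_INITIALIZER;

static void *worker(void *arg)
{
	pthread_mutex_t *my_lock = arg;   /* each thread uses "its" lock */

	for (int i = 0; i < 1000000; i++) {
		pthread_mutex_lock(my_lock);
		shared.a++;               /* not atomic vs the other thread */
		shared.b++;
		if (shared.a != shared.b)
			fprintf(stderr, "invariant broken: %ld != %ld\n",
			        shared.a, shared.b);
		pthread_mutex_unlock(my_lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, worker, &lock_grp1);
	pthread_create(&t2, NULL, worker, &lock_grp2);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("final: a=%ld b=%ld\n", shared.a, shared.b);
	return 0;
}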

One approach could consist in storing in the task's descriptor the group
number it was queued under (we don't need 32 bits to store the thread id,
so one short could hold the tid and another one the tgrp). Sadly, no safe
way to do this could be found, because the race remains at the moment the
thread group number is checked, as it might be in the process of being
changed by another thread. It seems that a working approach could consist
in always keeping the task associated with one group, and only allowing
that association to change under this group's lock, so that any code
trying to change it would have to iteratively read the group number and
lock that group until the value matches, confirming it really holds the
correct lock. But this seems a bit complicated, particularly with
wake_expired_tasks() which already uses upgradable locks to switch from
a read state to a write state.
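
For the record, a rough sketch of that rejected scheme follows, with
hypothetical names and plain pthreads instead of the HA_RWLOCK API (none
of this is part of the patch): the dequeuing side would have to loop,
re-reading the task's group id after taking that group's lock, until
both agree.

/* Hypothetical sketch of the rejected approach, NOT part of this patch.
 * The task records which group's wait queue currently holds it, and a
 * thread wanting to dequeue it loops until the group id read before
 * taking the lock still matches after taking it, which is the only way
 * to be sure the lock it holds really covers the task.
 */
#include <pthread.h>
#include <stdatomic.h>

#define MAX_TGROUPS 16

struct fake_tgroup {
	pthread_rwlock_t wq_lock;        /* stand-in for a per-group wq_lock,
	                                  * assumed initialized at startup */
};

struct fake_task {
	_Atomic unsigned short wq_tgid;  /* group whose wait queue holds the task */
};

static struct fake_tgroup tgroups[MAX_TGROUPS];

/* Take the write lock of the group that really owns <t>'s wait queue
 * entry and return that group's id, retrying as long as the task keeps
 * migrating to another group's queue.
 */
unsigned short lock_owning_group(struct fake_task *t)
{
	unsigned short tgid;

	while (1) {
		tgid = atomic_load(&t->wq_tgid);
		pthread_rwlock_wrlock(&tgroups[tgid].wq_lock);
		if (atomic_load(&t->wq_tgid) == tgid)
			return tgid;     /* confirmed: we hold the right lock */
		/* the task moved to another group's queue meanwhile; retry */
		pthread_rwlock_unlock(&tgroups[tgid].wq_lock);
	}
}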

Given that the shared tasks are not that common (stick-table expirations,
rate-limited listeners, maybe resolvers), it doesn't seem worth the extra
complexity for now. This patch takes a simpler and safer approach
consisting in switching back to a single wq_lock, but still keeping
separate wait queues. Given that shared wait queues are almost always
empty and that otherwise they're scanned under a read lock, the
contention remains manageable and most of the time the lock doesn't
even need to be taken since such tasks are not present in a group's
queue. In essence, this patch reverts half of the aforementioned patch.
This was tested and confirmed to work fine, without observing any
performance degradation under any workload. The performance with 8 groups
on an EPYC 74F3 and 3 tables remains twice that of a single group, with
the table's lock remaining the primary point of contention.
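
For a condensed view of the retained scheme, the sketch below uses
made-up names and plain pthreads (the real code below relies on HAProxy's
upgradable HA_RWLOCK seek lock rather than the plain write lock shown
here): a single process-wide rwlock covers every group's shared wait
queue, the emptiness check stays lockless, scanning only takes the read
side, and any modification takes the write side regardless of which
group's queue currently holds the entry.

/* Condensed sketch of the retained locking scheme (illustrative only).
 * One process-wide rwlock covers all per-group shared wait queues;
 * since those queues are almost always empty, the common case never
 * takes the lock at all.
 */
#include <pthread.h>

struct wq_entry {
	struct wq_entry *next;
	int expire;
};

struct group_wq {
	struct wq_entry *head;            /* stand-in for a group's timers tree */
};

/* the one lock shared by all groups' wait queues */
static pthread_rwlock_t wq_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Return the earliest expiration queued in <wq>, or -1 if none. */
int next_expiry(struct group_wq *wq)
{
	int ret = -1;

	if (!wq->head)                    /* lockless empty check: the common case */
		return ret;

	pthread_rwlock_rdlock(&wq_lock);  /* scanning only needs the read side */
	for (struct wq_entry *e = wq->head; e; e = e->next)
		if (ret < 0 || e->expire < ret)
			ret = e->expire;
	pthread_rwlock_unlock(&wq_lock);
	return ret;
}

/* Unlinking an entry takes the write side of the same lock no matter
 * which group's queue holds the entry, so a concurrent expiration on
 * another group can no longer race with the removal.
 */
void unlink_entry(struct group_wq *wq, struct wq_entry *ent)
{
	pthread_rwlock_wrlock(&wq_lock);
	for (struct wq_entry **p = &wq->head; *p; p = &(*p)->next) {
		if (*p == ent) {
			*p = ent->next;
			break;
		}
	}
	pthread_rwlock_unlock(&wq_lock);
}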

No backport is needed.
Author: Willy Tarreau
Date:   2022-11-22 07:05:44 +01:00
Parent: 469fa47950
Commit: fc50b9dd14

3 changed files with 26 additions and 19 deletions

include/haproxy/task.h

@@ -91,6 +91,8 @@ extern struct pool_head *pool_head_task;
 extern struct pool_head *pool_head_tasklet;
 extern struct pool_head *pool_head_notification;
 
+__decl_thread(extern HA_RWLOCK_T wq_lock THREAD_ALIGNED(64));
+
 void __tasklet_wakeup_on(struct tasklet *tl, int thr);
 struct list *__tasklet_wakeup_after(struct list *head, struct tasklet *tl);
 void task_kill(struct task *t);
@@ -264,10 +266,10 @@ static inline struct task *task_unlink_wq(struct task *t)
 		locked = t->tid < 0;
 		BUG_ON(t->tid >= 0 && t->tid != tid && !(global.mode & MODE_STOPPING));
 		if (locked)
-			HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		__task_unlink_wq(t);
 		if (locked)
-			HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	}
 	return t;
 }
@@ -294,10 +296,10 @@ static inline void task_queue(struct task *task)
 #ifdef USE_THREAD
 	if (task->tid < 0) {
-		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
 			__task_queue(task, &tg_ctx->timers);
-		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	} else
 #endif
 	{
@@ -654,14 +656,14 @@ static inline void task_schedule(struct task *task, int when)
 #ifdef USE_THREAD
 	if (task->tid < 0) {
 		/* FIXME: is it really needed to lock the WQ during the check ? */
-		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (task_in_wq(task))
 			when = tick_first(when, task->expire);
 		task->expire = when;
 		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key))
 			__task_queue(task, &tg_ctx->timers);
-		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
 	} else
 #endif
 	{

include/haproxy/tinfo-t.h

@@ -70,7 +70,6 @@ struct tgroup_ctx {
 	ulong threads_idle; /* mask of threads idling in the poller */
 	ulong stopping_threads; /* mask of threads currently stopping */
 
-	HA_RWLOCK_T wq_lock; /* RW lock related to the wait queue below */
 	struct eb_root timers; /* wait queue (sorted timers tree, global, accessed under wq_lock) */
 
 	uint niced_tasks; /* number of niced tasks in this group's run queues */

src/task.c

@@ -35,6 +35,12 @@ DECLARE_POOL(pool_head_tasklet, "tasklet", sizeof(struct tasklet));
  */
 DECLARE_POOL(pool_head_notification, "notification", sizeof(struct notification));
 
+/* The lock protecting all wait queues at once. For now we have no better
+ * alternative since a task may have to be removed from a queue and placed
+ * into another one. Storing the WQ index into the task doesn't seem to be
+ * sufficient either.
+ */
+__decl_thread(HA_RWLOCK_T wq_lock THREAD_ALIGNED(64) = 0);
 
 /* Flags the task <t> for immediate destruction and puts it into its first
  * thread's shared tasklet list if not yet queued/running. This will bypass
@@ -362,29 +368,29 @@ void wake_expired_tasks()
 	if (eb_is_empty(&tg_ctx->timers))
 		goto leave;
-	HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+	HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock);
 	eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK);
 	if (!eb) {
 		eb = eb32_first(&tg_ctx->timers);
 		if (likely(!eb)) {
-			HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 			goto leave;
 		}
 	}
 	key = eb->key;
 	if (tick_is_lt(now_ms, key)) {
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		goto leave;
 	}
 	/* There's really something of interest here, let's visit the queue */
-	if (HA_RWLOCK_TRYRDTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock)) {
+	if (HA_RWLOCK_TRYRDTOSK(TASK_WQ_LOCK, &wq_lock)) {
 		/* if we failed to grab the lock it means another thread is
 		 * already doing the same here, so let it do the job.
 		 */
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		goto leave;
 	}
@@ -423,20 +429,20 @@ void wake_expired_tasks()
 		if (tick_is_expired(task->expire, now_ms)) {
 			/* expired task, wake it up */
-			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock);
 			__task_unlink_wq(task);
-			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock);
 			task_drop_running(task, TASK_WOKEN_TIMER);
 		}
 		else if (task->expire != eb->key) {
 			/* task is not expired but its key doesn't match so let's
 			 * update it and skip to next apparently expired task.
 			 */
-			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_SKTOWR(TASK_WQ_LOCK, &wq_lock);
 			__task_unlink_wq(task);
 			if (tick_isset(task->expire))
 				__task_queue(task, &tg_ctx->timers);
-			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+			HA_RWLOCK_WRTOSK(TASK_WQ_LOCK, &wq_lock);
 			task_drop_running(task, 0);
 			goto lookup_next;
 		}
@@ -448,7 +454,7 @@ void wake_expired_tasks()
 		}
 	}
 
-	HA_RWLOCK_SKUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+	HA_RWLOCK_SKUNLOCK(TASK_WQ_LOCK, &wq_lock);
 #endif
 leave:
 	return;
@@ -480,13 +486,13 @@ int next_timer_expiry()
 #ifdef USE_THREAD
 	if (!eb_is_empty(&tg_ctx->timers)) {
-		HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDLOCK(TASK_WQ_LOCK, &wq_lock);
 		eb = eb32_lookup_ge(&tg_ctx->timers, now_ms - TIMER_LOOK_BACK);
 		if (!eb)
 			eb = eb32_first(&tg_ctx->timers);
 		if (eb)
 			key = eb->key;
-		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &tg_ctx->wq_lock);
+		HA_RWLOCK_RDUNLOCK(TASK_WQ_LOCK, &wq_lock);
 		if (eb)
 			ret = tick_first(ret, key);
 	}