From 060a7612487c244175fa1dc1e5b224015cbcf503 Mon Sep 17 00:00:00 2001
From: Willy Tarreau
Date: Wed, 10 Mar 2021 11:06:26 +0100
Subject: [PATCH] OPTIM: task: automatically adjust the default runqueue-depth
 to the threads

The recent default runqueue size reduction appeared to have significantly
lowered performance on low-thread count configs. Testing various runqueue
values on different workloads under thread counts ranging from 1 to 64, it
appeared that lower values are more optimal for high thread counts and
conversely. It could even be drawn that the optimal value for various
workloads sits around 280/sqrt(nbthread), and probably has to do with both
the L3 cache usage and how to optimally interlace the threads' activity to
minimize contention.

This is much easier to optimally configure, so let's do this by default now.
---
 doc/configuration.txt      | 13 +++++++------
 include/haproxy/defaults.h | 17 +++++------------
 src/haproxy.c              | 10 ++++++++--
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index f0801aaba..60fe2a46a 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -2494,12 +2494,13 @@ tune.recv_enough
 
 tune.runqueue-depth
   Sets the maximum amount of task that can be processed at once when running
-  tasks. The default value is 40 which tends to show the highest request rates
-  and lowest latencies. Increasing it may incur latency when dealing with I/Os,
-  making it too small can incur extra overhead. When experimenting with much
-  larger values, it may be useful to also enable tune.sched.low-latency and
-  possibly tune.fd.edge-triggered to limit the maximum latency to the lowest
-  possible.
+  tasks. The default value depends on the number of threads but sits between 35
+  and 280, which tend to show the highest request rates and lowest latencies.
+  Increasing it may incur latency when dealing with I/Os, making it too small
+  can incur extra overhead. Higher thread counts benefit from lower values.
+  When experimenting with much larger values, it may be useful to also enable
+  tune.sched.low-latency and possibly tune.fd.edge-triggered to limit the
+  maximum latency to the lowest possible.
 
 tune.sched.low-latency { on | off }
   Enables ('on') or disables ('off') the low-latency task scheduler. By default
diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h
index 13b5ad3fc..3dc98e5ce 100644
--- a/include/haproxy/defaults.h
+++ b/include/haproxy/defaults.h
@@ -186,19 +186,12 @@
 #define MAX_ACCEPT 4
 #endif
 
-// the max number of tasks to run at once. Tests have shown the following
-// number of requests/s for 1 to 16 threads (1c1t, 1c2t, 2c4t, 4c8t, 4c16t):
-//
-// rq\thr|    1     2     4     8    16
-// ------+------------------------------
-//     32| 120k  159k  276k  477k  698k
-//     40| 122k  160k  276k  478k  722k
-//     48| 121k  159k  274k  482k  720k
-//     64| 121k  160k  274k  469k  710k
-//    200| 114k  150k  247k  415k  613k
-//
+// The base max number of tasks to run at once to be used when not set by
+// tune.runqueue-depth. It will automatically be divided by the square root
+// of the number of threads for better fairness. As such, 64 threads will
+// use 35 and a single thread will use 280.
 #ifndef RUNQUEUE_DEPTH
-#define RUNQUEUE_DEPTH 40
+#define RUNQUEUE_DEPTH 280
 #endif
 
 // cookie delimiter in "prefix" mode. This character is inserted between the
diff --git a/src/haproxy.c b/src/haproxy.c
index 49f6957c3..7b30a78a2 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2274,8 +2274,14 @@ static void init(int argc, char **argv)
 	if (global.tune.maxpollevents <= 0)
 		global.tune.maxpollevents = MAX_POLL_EVENTS;
 
-	if (global.tune.runqueue_depth <= 0)
-		global.tune.runqueue_depth = RUNQUEUE_DEPTH;
+	if (global.tune.runqueue_depth <= 0) {
+		/* tests on various thread counts from 1 to 64 have shown an
+		 * optimal queue depth following roughly 1/sqrt(threads).
+		 */
+		int s = my_flsl(global.nbthread);
+		s += (global.nbthread / s); // roughly twice the sqrt.
+		global.tune.runqueue_depth = RUNQUEUE_DEPTH * 2 / s;
+	}
 
 	if (global.tune.recv_enough == 0)
 		global.tune.recv_enough = MIN_RECV_AT_ONCE_ENOUGH;