From 41211ebc72bab0dca1716d6afdaf71066df67583 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 12:11:41 +0100 Subject: [PATCH 001/140] BUG/MINOR: sink: make sure to always properly unmap a file-backed ring The munmap() call performed on exit was incorrect since it used to apply to the buffer instead of the area, so neither the pointer nor the size were page-aligned. This patches corrects this and also adds a call to msync() since munmap() alone doesn't guarantee that data will be dumped. This should be backported to 2.6. (cherry picked from commit fb9a4765b74cc18f25a6b732a66aae952262a7e0) Signed-off-by: Willy Tarreau (cherry picked from commit 7627fd761e090f9e4f538da5a4af7d28de38df67) Signed-off-by: Christopher Faulet --- src/sink.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/sink.c b/src/sink.c index ef3d0f0f2..f897e99f0 100644 --- a/src/sink.c +++ b/src/sink.c @@ -1393,8 +1393,13 @@ static void sink_deinit() list_for_each_entry_safe(sink, sb, &sink_list, sink_list) { if (sink->type == SINK_TYPE_BUFFER) { - if (sink->store) - munmap(sink->ctx.ring->buf.area, sink->ctx.ring->buf.size); + if (sink->store) { + size_t size = (sink->ctx.ring->buf.size + 4095UL) & -4096UL; + void *area = (sink->ctx.ring->buf.area - sizeof(*sink->ctx.ring)); + + msync(area, size, MS_SYNC); + munmap(area, size); + } else ring_free(sink->ctx.ring); } From cb09e5787407f8ebd4a9aba33022743420e3ad19 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 12:13:14 +0100 Subject: [PATCH 002/140] DEV: haring: add a new option "-r" to automatically repair broken files In case a file-backed ring was not properly synced before being dumped, the output can look bogus due to the head pointer not being perfectly up to date. In this case, passing "-r" will make haring automatically skip entries not starting with a zero, and resynchronize with the rest of the messages. This should be backported to 2.6. 
(cherry picked from commit e06ba9031843b6a55df486f5115b6675d71e22f7) Signed-off-by: Willy Tarreau (cherry picked from commit 562690e04df4180b31d5adc904b0714ea49a5fd2) Signed-off-by: Christopher Faulet --- dev/haring/haring.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dev/haring/haring.c b/dev/haring/haring.c index 27e03eecf..53352cbd1 100644 --- a/dev/haring/haring.c +++ b/dev/haring/haring.c @@ -37,6 +37,7 @@ int force = 0; // force access to a different layout int lfremap = 0; // remap LF in traces +int repair = 0; // repair file /* display the message and exit with the code */ @@ -61,6 +62,7 @@ __attribute__((noreturn)) void usage(int code, const char *arg0) "options :\n" " -f : force accessing a non-matching layout for 'ring struct'\n" " -l : replace LF in contents with CR VT\n" + " -r : \"repair\" corrupted file (actively search for message boundaries)\n" "\n" "", arg0); } @@ -146,6 +148,14 @@ int dump_ring(struct ring *ring, size_t ofs, int flags) * stop before the end. */ while (ofs + 1 < b_data(&buf)) { + if (unlikely(repair && *b_peek(&buf, ofs))) { + /* in repair mode we consider that we could have landed + * in the middle of a message so we skip all bytes till + * the next zero. + */ + ofs++; + continue; + } cnt = 1; len = b_peek_varint(&buf, ofs + cnt, &msg_len); if (!len) @@ -219,6 +229,8 @@ int main(int argc, char **argv) force = 1; else if (strcmp(argv[0], "-l") == 0) lfremap = 1; + else if (strcmp(argv[0], "-r") == 0) + repair = 1; else if (strcmp(argv[0], "--") == 0) break; else From e4b97e28bb6d10db2a4692142f0ca0a4f466d441 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 26 Jan 2023 15:32:12 +0100 Subject: [PATCH 003/140] BUG/MINOR: log: release global log servers on exit Since 2.6 we have a free_logsrv() function that is used to release log servers. 
It must be called from deinit() instead of manually iterating over the log servers, otherwise some parts of the structure are not freed (namely the ring name), as reported by ASAN. This should be backported to 2.6. (cherry picked from commit 2c701dbc0764157aa74dd90ed9a4c1133ddce97b) Signed-off-by: Willy Tarreau (cherry picked from commit d710d820a30d13ae76f52407fbc49e86c2625958) Signed-off-by: Christopher Faulet --- src/haproxy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/haproxy.c b/src/haproxy.c index b02958d3c..809c9eb05 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -2727,10 +2727,10 @@ void deinit(void) idle_conn_task = NULL; list_for_each_entry_safe(log, logb, &global.logsrvs, list) { - LIST_DELETE(&log->list); - free(log->conf.file); - free(log); - } + LIST_DEL_INIT(&log->list); + free_logsrv(log); + } + list_for_each_entry_safe(wl, wlb, &cfg_cfgfiles, list) { free(wl->s); LIST_DELETE(&wl->list); From f2433e9ac2e42bcc4185598221005fafa6b9577a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 26 Jan 2023 15:46:08 +0100 Subject: [PATCH 004/140] BUG/MINOR: sink: free the forwarding task on exit ASAN reported a small leak of the sink's forwarding task on exit. This should be backported as far as 2.2. 
(cherry picked from commit 09727ee201211df47a3158334e73a8346f599716) Signed-off-by: Willy Tarreau (cherry picked from commit c5a51631a90846551271eebfaba24f763d88c55e) Signed-off-by: Christopher Faulet --- src/sink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sink.c b/src/sink.c index f897e99f0..8f95ef984 100644 --- a/src/sink.c +++ b/src/sink.c @@ -1404,6 +1404,7 @@ static void sink_deinit() ring_free(sink->ctx.ring); } LIST_DELETE(&sink->sink_list); + task_destroy(sink->forward_task); free(sink->name); free(sink->desc); free(sink); From 6a46ea603ccdc16f464dd6364454a1dc44cfcac9 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Wed, 25 Jan 2023 16:35:00 +0100 Subject: [PATCH 005/140] DEV: hpack: fix `trash` build regression Since 7d84439 ("BUILD: hpack: include global.h for the trash that is needed in debug mode"), hpack decode tool fails to compile on targets that enable USE_THREAD. (ie: linux-glibc target as reported by Christian Ruppert) When building hpack devtool, we are including src/hpack-dec.c as a dependency. src/hpack-dec.c relies on the global trash whe debug mode is enabled. But as we're building hpack tool with a limited scope of haproxy sources, global trash (which is declared in src/chunk.c) is not available. Thus, src/hpack-dec.c relies on a local 'trash' variable declared within dev/hpack/decode.c This used to work fine until 7d84439. But now that global.h is explicitely included in src/hpack-dec.c, trash variable definition from decode.c conflicts with the one from global.h: In file included from include/../src/hpack-dec.c:35, from dev/hpack/decode.c:87: include/haproxy/global.h:52:35: error: thread-local declaration of 'trash' follows non-thread-local declaration 52 | extern THREAD_LOCAL struct buffer trash; Adding THREAD_LOCAL attribute to 'decode.c' local trash variable definition makes the compiler happy again. This should fix GH issue #2009 and should be backported to 2.7. 
(cherry picked from commit 532ebee38e83b0b406e91af88262432012d943f4) Signed-off-by: Willy Tarreau (cherry picked from commit 29d13c9a7d8b764b04efc18aae04dead19fb7c1c) Signed-off-by: Willy Tarreau --- dev/hpack/decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/hpack/decode.c b/dev/hpack/decode.c index ae825120f..13c95c77a 100644 --- a/dev/hpack/decode.c +++ b/dev/hpack/decode.c @@ -30,7 +30,7 @@ uint8_t buf[MAX_RQ_SIZE]; char trash_buf[MAX_RQ_SIZE]; char tmp_buf[MAX_RQ_SIZE]; -struct buffer trash = { .area = trash_buf, .data = 0, .size = sizeof(trash_buf) }; +THREAD_LOCAL struct buffer trash = { .area = trash_buf, .data = 0, .size = sizeof(trash_buf) }; struct buffer tmp = { .area = tmp_buf, .data = 0, .size = sizeof(tmp_buf) }; /* displays a long memory block at , assuming first byte of From 8ad76b6dd77d1ea0fe63f66bb36a66803f296af8 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 12 Jan 2023 15:44:22 +0100 Subject: [PATCH 006/140] BUG/MINOR: fcgi-app: prevent 'use-fcgi-app' in default section Despite the doc saying that 'use-fcgi-app' keyword may only be used in backend or listen section, we forgot to prevent its usage in default section. This is wrong because fcgi relies on a filter, and filters cannot be defined in a default section. Making sure such usage reports an error to the user and complies with the doc. This could be backported up to 2.2. 
(cherry picked from commit d49a580fdaebbd185a9ee6ede072fc13bec5d5bf) Signed-off-by: Willy Tarreau (cherry picked from commit 3269942e18510b6c8aa0be23a6574bd7bbc73f27) Signed-off-by: Willy Tarreau --- src/fcgi-app.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fcgi-app.c b/src/fcgi-app.c index c9405ec4d..1ece72b25 100644 --- a/src/fcgi-app.c +++ b/src/fcgi-app.c @@ -589,7 +589,7 @@ static int proxy_parse_use_fcgi_app(char **args, int section, struct proxy *curp struct fcgi_flt_conf *fcgi_conf = NULL; int retval = 0; - if (!(curpx->cap & PR_CAP_BE)) { + if ((curpx->cap & PR_CAP_DEF) || !(curpx->cap & PR_CAP_BE)) { memprintf(err, "'%s' only available in backend or listen section", args[0]); retval = -1; goto end; From 110d1245b1b5fc9a96085b121fb3585886ff0d93 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 30 Jan 2023 12:12:11 +0100 Subject: [PATCH 007/140] MINOR: mux-quic/h3: define stream close callback Define a new qcc_app_ops callback named close(). This will be used to notify app-layer about the closure of a stream by the remote peer. Its main usage is to ensure that the closure is allowed by the application protocol specification. For the moment, close is not implemented by H3 layer. However, this function will be mandatory to properly reject a STOP_SENDING on the control stream and preventing a later crash. As such, this commit must be backported with the next one on 2.6. This is related to github issue #2006. 
(cherry picked from commit 1e340ba6bc0f747bf94e14c91f0351a9a0d7cf03) Signed-off-by: Willy Tarreau (cherry picked from commit b403127cdb6fbac47dbf16c0587166337d2531b3) Signed-off-by: Willy Tarreau --- include/haproxy/mux_quic-t.h | 7 +++++++ src/h3.c | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/haproxy/mux_quic-t.h b/include/haproxy/mux_quic-t.h index d716e08d5..4655a9b14 100644 --- a/include/haproxy/mux_quic-t.h +++ b/include/haproxy/mux_quic-t.h @@ -183,12 +183,19 @@ struct qcs { int start; /* base timestamp for http-request timeout */ }; +/* Used as qcc_app_ops.close callback argument. */ +enum qcc_app_ops_close_side { + QCC_APP_OPS_CLOSE_SIDE_RD, /* Read channel closed (RESET_STREAM received). */ + QCC_APP_OPS_CLOSE_SIDE_WR /* Write channel closed (STOP_SENDING received). */ +}; + /* QUIC application layer operations */ struct qcc_app_ops { int (*init)(struct qcc *qcc); int (*attach)(struct qcs *qcs, void *conn_ctx); ssize_t (*decode_qcs)(struct qcs *qcs, struct buffer *b, int fin); size_t (*snd_buf)(struct qcs *qcs, struct htx *htx, size_t count); + int (*close)(struct qcs *qcs, enum qcc_app_ops_close_side side); void (*detach)(struct qcs *qcs); int (*finalize)(void *ctx); void (*shutdown)(void *ctx); /* Close a connection. */ diff --git a/src/h3.c b/src/h3.c index abede397d..98160217d 100644 --- a/src/h3.c +++ b/src/h3.c @@ -1301,6 +1301,24 @@ static size_t h3_snd_buf(struct qcs *qcs, struct htx *htx, size_t count) return total; } +/* Notify about a closure on stream requested by the remote peer. + * + * Stream channel is explained relative to our endpoint : WR for + * STOP_SENDING or RD for RESET_STREAM reception. Callback decode_qcs() is used + * instead for closure performed using a STREAM frame with FIN bit. + * + * The main objective of this function is to check if closure is valid + * according to HTTP/3 specification. + * + * Returns 0 on success else non-zero. A CONNECTION_CLOSE is generated on + * error. 
+ */ +static int h3_close(struct qcs *qcs, enum qcc_app_ops_close_side side) +{ + /* TODO */ + return 0; +} + static int h3_attach(struct qcs *qcs, void *conn_ctx) { struct h3s *h3s; @@ -1489,6 +1507,7 @@ const struct qcc_app_ops h3_ops = { .attach = h3_attach, .decode_qcs = h3_decode_qcs, .snd_buf = h3_snd_buf, + .close = h3_close, .detach = h3_detach, .finalize = h3_finalize, .shutdown = h3_shutdown, From 26d42e957e205a97eb4d6bd5ccf1fbfbb5258bbe Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 30 Jan 2023 12:12:43 +0100 Subject: [PATCH 008/140] BUG/MEDIUM: h3: handle STOP_SENDING on control stream Before this patch, STOP_SENDING reception was considered valid even on H3 control stream. This causes the emission in return of RESET_STREAM and eventually the closure and freeing of the QCS instance. This then causes a crash during connection closure as a GOAWAY frame is emitted on the control stream which is now released. To fix this crash, STOP_SENDING on the control stream is now properly rejected as specified by RFC 9114. The new app_ops close callback is used which in turn will generate a CONNECTION_CLOSE with error H3_CLOSED_CRITICAL_STREAM. This bug was detected in github issue #2006. Note that however it is triggered by an incorrect client behavior. It may be useful to determine which client behaves like this. If this case is too frequent, STOP_SENDING should probably be silently ignored. 
To reproduce this issue, quiche was patched to emit a STOP_SENDING on its send() function in quiche/src/lib.rs: pub fn send(&mut self, out: &mut [u8]) -> Result<(usize, SendInfo)> { - self.send_on_path(out, None, None) + let ret = self.send_on_path(out, None, None); + self.streams.mark_stopped(3, true, 0); + ret } This must be backported up to 2.6 along with the preceeding commit : MINOR: mux-quic/h3: define close callback (cherry picked from commit 87f8766d3fbd10f9e8bf4902d37712612db64df5) Signed-off-by: Willy Tarreau (cherry picked from commit 670c2c30f6430cfd92fc4b32ee3b90af3660e900) Signed-off-by: Willy Tarreau --- src/h3.c | 18 +++++++++++++++++- src/mux_quic.c | 7 +++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/h3.c b/src/h3.c index 98160217d..ffa4e0402 100644 --- a/src/h3.c +++ b/src/h3.c @@ -1315,7 +1315,23 @@ static size_t h3_snd_buf(struct qcs *qcs, struct htx *htx, size_t count) */ static int h3_close(struct qcs *qcs, enum qcc_app_ops_close_side side) { - /* TODO */ + struct h3s *h3s = qcs->ctx; + struct h3c *h3c = h3s->h3c;; + + /* RFC 9114 6.2.1. Control Streams + * + * The sender + * MUST NOT close the control stream, and the receiver MUST NOT + * request that the sender close the control stream. If either + * control stream is closed at any point, this MUST be treated + * as a connection error of type H3_CLOSED_CRITICAL_STREAM. 
+ */ + if (qcs == h3c->ctrl_strm) { + TRACE_ERROR("closure detected on control stream", H3_EV_H3S_END, qcs->qcc, qcs); + qcc_emit_cc_app(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1); + return 1; + } + return 0; } diff --git a/src/mux_quic.c b/src/mux_quic.c index a723effa5..417e8d1db 100644 --- a/src/mux_quic.c +++ b/src/mux_quic.c @@ -1102,6 +1102,13 @@ int qcc_recv_stop_sending(struct qcc *qcc, uint64_t id, uint64_t err) qcs_idle_open(qcs); + if (qcc->app_ops->close) { + if (qcc->app_ops->close(qcs, QCC_APP_OPS_CLOSE_SIDE_WR)) { + TRACE_ERROR("closure rejected by app layer", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + goto out; + } + } + /* RFC 9000 3.5. Solicited State Transitions * * An endpoint that receives a STOP_SENDING frame From c076949887aa2c8496893f7a240c66e3f2b72e84 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 31 Jan 2023 14:12:28 +0100 Subject: [PATCH 009/140] BUG/MEDIUM: ssl: wrong eviction from the session cache tree When using WolfSSL, there are some cases were the SSL_CTX_sess_new_cb is called with an existing session ID. These cases are not met with OpenSSL. When the ID is found in the session tree during the insertion, the shared_block len is not set to 0 and is not used. However if later the block is reused, since the len is not set to 0, the release callback will be called an ebmb_delete will be tried on the block, even if it's not in the tree, provoking a crash. The code was buggy from the beginning, but the case never happen with openssl which changes the ID. Must be backported in every maintained branches. 
(cherry picked from commit 222e5a260bea5de940db2fd6cf19da2176ac8934) Signed-off-by: Willy Tarreau (cherry picked from commit 1eefaf64200454cf02e708f5e07646b55713b1a3) Signed-off-by: Willy Tarreau --- src/ssl_sock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ssl_sock.c b/src/ssl_sock.c index 6b95f9566..16106e95a 100644 --- a/src/ssl_sock.c +++ b/src/ssl_sock.c @@ -4509,6 +4509,7 @@ static int sh_ssl_sess_store(unsigned char *s_id, unsigned char *data, int data_ if (oldsh_ssl_sess != sh_ssl_sess) { /* NOTE: Row couldn't be in use because we lock read & write function */ /* release the reserved row */ + first->len = 0; /* the len must be liberated in order not to call the release callback on it */ shctx_row_dec_hot(ssl_shctx, first); /* replace the previous session already in the tree */ sh_ssl_sess = oldsh_ssl_sess; From a0b0d022707e1c8652fe70b609869a3ff9076539 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Tue, 31 Jan 2023 16:01:22 +0100 Subject: [PATCH 010/140] BUG/MINOR: h3: fix crash due to h3 traces First H3 traces argument must be a connection instance or a NULL. Some new traces were added recently with a qcc instance which caused a crash when traces are activated. This trace was added by the following patch : 87f8766d3fbd10f9e8bf4902d37712612db64df5 BUG/MEDIUM: h3: handle STOP_SENDING on control stream This must be backported up to 2.6 along with the above patch. (cherry picked from commit e31867b7facd54edf0b667bfb3e8d2dede50d86c) Signed-off-by: Willy Tarreau (cherry picked from commit 3cfbe9d053700446c059ee9b2c9056c867ef6686) [wt: adj ctx: test on H3S_T_CTRL is not in 2.6] Signed-off-by: Willy Tarreau --- src/h3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/h3.c b/src/h3.c index ffa4e0402..3b3db2497 100644 --- a/src/h3.c +++ b/src/h3.c @@ -1327,7 +1327,7 @@ static int h3_close(struct qcs *qcs, enum qcc_app_ops_close_side side) * as a connection error of type H3_CLOSED_CRITICAL_STREAM. 
*/ if (qcs == h3c->ctrl_strm) { - TRACE_ERROR("closure detected on control stream", H3_EV_H3S_END, qcs->qcc, qcs); + TRACE_ERROR("closure detected on control stream", H3_EV_H3S_END, qcs->qcc->conn, qcs); qcc_emit_cc_app(qcs->qcc, H3_CLOSED_CRITICAL_STREAM, 1); return 1; } From 45ccbb3d4faafd4c096ad4713c59a58feaaa023d Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 2 Feb 2023 15:03:12 +0100 Subject: [PATCH 011/140] BUG/MINOR: stats: use proper buffer size for http dump In an attempt to fix GH #1873, ("BUG/MEDIUM: stats: Rely on a local trash buffer to dump the stats") explicitly reduced output buffer size to leave enough space for htx overhead under http context. Github user debtsandbooze, who first reported the issue, came back to us and said he was still able to make the http dump "hang" with the new fix. After some tests, it became clear that htx_add_data_atonce() could fail from time to time in stats_putchk(), even if htx was completely empty: In http context, buffer size is maxed out at channel_htx_recv_limit(). Unfortunately, channel_htx_recv_limit() is not what we're looking for here because limit() doesn't compute the proper htx overhead. Using buf_room_for_htx_data() instead of channel_htx_recv_limit() to compute max "usable" data space seems to be the last piece of work required for the previous fix to work properly. This should be backported everywhere the aforementioned commit is. 
(cherry picked from commit 5e7ecbec997e3112c8f1e30f9f2f2a719f6cc98e) Signed-off-by: Willy Tarreau (cherry picked from commit bc66f47532120c590a1d9aaa25b45263b543008a) Signed-off-by: Willy Tarreau --- src/stats.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/stats.c b/src/stats.c index 834e9bf8a..7fcfbd3a6 100644 --- a/src/stats.c +++ b/src/stats.c @@ -4369,7 +4369,11 @@ static void http_stats_io_handler(struct appctx *appctx) } if (appctx->st0 == STAT_HTTP_DUMP) { - trash_chunk = b_make(trash.area, channel_htx_recv_limit(res, res_htx), 0, 0); + trash_chunk = b_make(trash.area, trash.size, 0, 0); + /* adjust buffer size to take htx overhead into account, + * make sure to perform this call on an empty buffer + */ + trash_chunk.size = buf_room_for_htx_data(&trash_chunk); if (stats_dump_stat_to_buffer(sc, res_htx, s->be->uri_auth)) appctx->st0 = STAT_HTTP_DONE; } From f0356cf6d3a579e3ac2ddb6f569b2a0509f9a3d0 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Fri, 3 Feb 2023 08:31:42 +0100 Subject: [PATCH 012/140] BUG/MINOR: stats: fix source buffer size for http dump In ("BUG/MINOR: stats: use proper buffer size for http dump"), we used trash.size as source buffer size before applying the htx overhead computation. It is safer to use res->buf.size instead since res_htx (which is argument passed to stats_putchk() in http context) is made from res->buf: in http_stats_io_handler: | res_htx = htx_from_buf(&res->buf); This will prevent the hang bug from showing up again if res->buf.size were to be less than trash.size (which is set according to tune.bufsize). 
This should be backported with ("BUG/MINOR: stats: use proper buffer size for http dump") (cherry picked from commit 14656844cc68794d0c6994c10a07a5f7ebce50f6) Signed-off-by: Willy Tarreau (cherry picked from commit cf5cc5bdb43e7b1185455975c530ddc1ab71b17d) Signed-off-by: Willy Tarreau --- src/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stats.c b/src/stats.c index 7fcfbd3a6..ce5fda177 100644 --- a/src/stats.c +++ b/src/stats.c @@ -4369,7 +4369,7 @@ static void http_stats_io_handler(struct appctx *appctx) } if (appctx->st0 == STAT_HTTP_DUMP) { - trash_chunk = b_make(trash.area, trash.size, 0, 0); + trash_chunk = b_make(trash.area, res->buf.size, 0, 0); /* adjust buffer size to take htx overhead into account, * make sure to perform this call on an empty buffer */ From 2c3be3118097e53e1ce0e6ce116db2cb81535a59 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 2 Feb 2023 17:27:27 +0100 Subject: [PATCH 013/140] BUG/MEDIUM: stats: fix resolvers dump In ("BUG/MEDIUM: stats: Rely on a local trash buffer to dump the stats"), we forgot to apply the patch in resolvers.c which provides the stats_dump_resolvers() function that is involved when dumping with "resolvers" domain. As a consequence, resolvers dump was broken because stats_dump_one_line(), which is used in stats_dump_resolv_to_buffer(), implicitely uses trash_chunk from stats.c to prepare the dump, and stats_putchk() is then called with global trash (currently empty) as output data. Given that trash_dump variable is static and thus only available within stats.c we change stats_putchk() function prototype so that the function does not take the output buffer as an argument. Instead, stats_putchk() will implicitly use the local trash_dump variable declared in stats.c. It will also prevent further mixups between stats_dump_* functions and stats_putchk(). 
This needs to be backported with ("BUG/MEDIUM: stats: Rely on a local trash buffer to dump the stats") (cherry picked from commit e5958d0292222c6cc122b1b19b79c959ec27370b) Signed-off-by: Willy Tarreau (cherry picked from commit 1a8db96980c972b10805dfc8f8287a86c46d0d81) Signed-off-by: Willy Tarreau --- include/haproxy/stats.h | 2 +- src/resolvers.c | 2 +- src/stats.c | 22 ++++++++++++---------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/include/haproxy/stats.h b/include/haproxy/stats.h index a8ffbc704..90a4f51ef 100644 --- a/include/haproxy/stats.h +++ b/include/haproxy/stats.h @@ -45,7 +45,7 @@ extern THREAD_LOCAL struct field info[]; extern THREAD_LOCAL struct field *stat_l[]; struct htx; -int stats_putchk(struct channel *chn, struct htx *htx, struct buffer *chk); +int stats_putchk(struct channel *chn, struct htx *htx); int stats_dump_one_line(const struct field *stats, size_t stats_count, struct appctx *appctx); diff --git a/src/resolvers.c b/src/resolvers.c index bbb881821..be22d551d 100644 --- a/src/resolvers.c +++ b/src/resolvers.c @@ -2640,7 +2640,7 @@ static int stats_dump_resolv_to_buffer(struct stconn *sc, if (!stats_dump_one_line(stats, idx, appctx)) return 0; - if (!stats_putchk(rep, NULL, &trash)) + if (!stats_putchk(rep, NULL)) goto full; return 1; diff --git a/src/stats.c b/src/stats.c index ce5fda177..be3a60f53 100644 --- a/src/stats.c +++ b/src/stats.c @@ -296,8 +296,10 @@ static inline enum stats_domain_px_cap stats_px_get_cap(uint32_t domain) static void stats_dump_json_schema(struct buffer *out); -int stats_putchk(struct channel *chn, struct htx *htx, struct buffer *chk) +int stats_putchk(struct channel *chn, struct htx *htx) { + struct buffer *chk = &trash_chunk; + if (htx) { if (chk->data >= channel_htx_recv_max(chn, htx)) return 0; @@ -3110,7 +3112,7 @@ more: case STAT_PX_ST_TH: if (ctx->flags & STAT_FMT_HTML) { stats_dump_html_px_hdr(sc, px); - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto 
full; } @@ -3120,7 +3122,7 @@ more: case STAT_PX_ST_FE: /* print the frontend */ if (stats_dump_fe_stats(sc, px)) { - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; if (ctx->field) goto more; @@ -3157,7 +3159,7 @@ more: /* print the frontend */ if (stats_dump_li_stats(sc, px, l)) { - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; if (ctx->field) goto more; @@ -3222,7 +3224,7 @@ more: } if (stats_dump_sv_stats(sc, px, sv)) { - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; } } /* for sv */ @@ -3233,7 +3235,7 @@ more: case STAT_PX_ST_BE: /* print the backend */ if (stats_dump_be_stats(sc, px)) { - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; if (ctx->field) goto more; @@ -3246,7 +3248,7 @@ more: case STAT_PX_ST_END: if (ctx->flags & STAT_FMT_HTML) { stats_dump_html_px_end(sc, px); - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; } @@ -3797,7 +3799,7 @@ static int stats_dump_stat_to_buffer(struct stconn *sc, struct htx *htx, else if (!(ctx->flags & STAT_FMT_TYPED)) stats_dump_csv_header(ctx->domain); - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; if (ctx->flags & STAT_JSON_SCHM) { @@ -3810,7 +3812,7 @@ static int stats_dump_stat_to_buffer(struct stconn *sc, struct htx *htx, case STAT_STATE_INFO: if (ctx->flags & STAT_FMT_HTML) { stats_dump_html_info(sc, uri); - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; } @@ -3849,7 +3851,7 @@ static int stats_dump_stat_to_buffer(struct stconn *sc, struct htx *htx, stats_dump_html_end(); else stats_dump_json_end(); - if (!stats_putchk(rep, htx, &trash_chunk)) + if (!stats_putchk(rep, htx)) goto full; } From 6c8503187d40a3126843a2ab384ba79e6f845dc6 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Fri, 3 Feb 2023 11:43:05 +0100 Subject: [PATCH 014/140] 
BUG/MINOR: stats: fix ctx->field update in stats_dump_proxy_to_buffer() When ctx->field was introduced with ("MINOR: stats: introduce stats field ctx") a mistake was made for the STAT_PX_ST_LI state in stats_dump_proxy_to_buffer(): current_field reset is placed after the for loop, ie: after multiple lines are dumped. Instead it should be placed right after each li line is dumped. This could cause some output inconsistencies (missing fields), especially when http dump is used with JSON output and "socket-stats" option is enabled on the proxy, because when htx is full we restore the ctx->field with current_field (which contains outdated value in this case). This should be backported with ("MINOR: stats: introduce stats field ctx") (cherry picked from commit 9b07d4fecd9451ad1e373c6c52c0547885c29920) Signed-off-by: Willy Tarreau (cherry picked from commit 9bec221ef68049ee806058b518fd8c91d3922db8) Signed-off-by: Willy Tarreau --- src/stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stats.c b/src/stats.c index be3a60f53..b7e1f62d0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -3164,9 +3164,9 @@ more: if (ctx->field) goto more; } + current_field = 0; } - current_field = 0; ctx->obj2 = px->srv; /* may be NULL */ ctx->px_st = STAT_PX_ST_SV; /* fall through */ From 120ad38e86b5b72e505a6f87caeea89b300a6ebe Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 2 Feb 2023 18:13:30 +0100 Subject: [PATCH 015/140] BUG/MINOR: stats: fix show stats field ctx for servers In ("MINOR: stats: introduce stats field ctx"), we forgot to apply the patch to servers. This prevents "BUG/MINOR: stats: fix show stat json buffer limitation" from working with servers dump. We're adding the missing part related to servers dump. This commit should be backported with the aforementioned commits. 
(cherry picked from commit 28a23617cebbb0c516eeec944e18ba20db72184f) Signed-off-by: Willy Tarreau (cherry picked from commit 08c1f203a8477486bc2d5a57c57fbe881b9819f9) Signed-off-by: Willy Tarreau --- src/stats.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/stats.c b/src/stats.c index b7e1f62d0..98fea8859 100644 --- a/src/stats.c +++ b/src/stats.c @@ -3226,7 +3226,10 @@ more: if (stats_dump_sv_stats(sc, px, sv)) { if (!stats_putchk(rep, htx)) goto full; + if (ctx->field) + goto more; } + current_field = 0; } /* for sv */ ctx->px_st = STAT_PX_ST_BE; From b39f55d9d55eae8e256bd6a195b5c94710d2a10b Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 2 Feb 2023 19:01:02 +0100 Subject: [PATCH 016/140] BUG/MINOR: stats: fix STAT_STARTED behavior with full htx When stats_putchk() fails to peform the dump because available data space in htx is less than the number of bytes pending in the dump buffer, we wait for more room in the htx (ie: sc_need_room()) to retry the dump attempt on the next applet invocation. To provide consistent output, we have to make sure that the stat ctx is not updated (or at least correctly reverted) in case stats_putchk() fails so that the new dumping attempt behaves just like the previous (failed) one. STAT_STARTED is not following this logic, the flag is set in stats_dump_fields_json() as soon as some data is written to the output buffer. It's done too early: we need to delay this step after the stats_putchk() has successfully returned if we want to correctly handle the retries attempts. Because of this, JSON output could suffer from extraneous ',' characters which could make json parsers unhappy. For example, this is the kind of errors you could get when using `python -m json.tool` on such badly formatted outputs: "Expecting value: line 1 column 2 (char 1)" Unfortunately, fixing this means that the flag needs to be enabled at multiple places, which is what we're doing in this patch. 
(in stats_dump_proxy_to_buffer() where stats_dump_one_line() is involved by underlying stats_dump_{fe,li,sv,be} functions) Thereby, this raises the need for a cleanup to reduce code duplication around stats_dump_proxy_to_buffer() function and simplify things a bit. It could be backported to 2.6 and 2.7 (cherry picked from commit 90304dcdd8048e88cbfcc55acdd6202cb416ba93) Signed-off-by: Willy Tarreau (cherry picked from commit 0ab90dd225f6c7b997bcc28b02cf9267d2f204be) Signed-off-by: Willy Tarreau --- src/stats.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/stats.c b/src/stats.c index 98fea8859..cb374dccb 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1650,9 +1650,6 @@ int stats_dump_one_line(const struct field *stats, size_t stats_count, else ret = stats_dump_fields_csv(&trash_chunk, stats, stats_count, ctx); - if (ret) - ctx->flags |= STAT_STARTED; - return ret; } @@ -3124,6 +3121,7 @@ more: if (stats_dump_fe_stats(sc, px)) { if (!stats_putchk(rep, htx)) goto full; + ctx->flags |= STAT_STARTED; if (ctx->field) goto more; } @@ -3161,6 +3159,7 @@ more: if (stats_dump_li_stats(sc, px, l)) { if (!stats_putchk(rep, htx)) goto full; + ctx->flags |= STAT_STARTED; if (ctx->field) goto more; } @@ -3226,6 +3225,7 @@ more: if (stats_dump_sv_stats(sc, px, sv)) { if (!stats_putchk(rep, htx)) goto full; + ctx->flags |= STAT_STARTED; if (ctx->field) goto more; } @@ -3240,6 +3240,7 @@ more: if (stats_dump_be_stats(sc, px)) { if (!stats_putchk(rep, htx)) goto full; + ctx->flags |= STAT_STARTED; if (ctx->field) goto more; } From 2162766fe8d71c2b45b3cecdba6b8caba0e3f351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Thu, 26 Jan 2023 15:07:39 +0100 Subject: [PATCH 017/140] BUG/MINOR: quic: Possible stream truncations under heavy loss This may happen during retransmission of frames which can be splitted (CRYPTO, or STREAM frames). 
One may have to split a frame to be retransmitted due to the QUIC protocol properties (packet size limitation and packet field encoding sizes). The remaining part of a frame which cannot be retransmitted must be detached from the original frame it is copied from. If not, when the really sent part will be acknowledged the remaining part will be acknowledged too but not sent! Must be backported to 2.7 and 2.6. (cherry picked from commit dd419461eff5395b369f5497fef703dc383577bb) Signed-off-by: Willy Tarreau (cherry picked from commit 42331e27db41499fd2f4a08bacca6bbe468debcd) Signed-off-by: Willy Tarreau --- src/quic_conn.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/quic_conn.c b/src/quic_conn.c index 371a8d8dd..90b14e996 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -6655,6 +6655,9 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, /* This frame was duplicated */ LIST_APPEND(&cf->origin->reflist, &new_cf->ref); new_cf->origin = cf->origin; + /* Detach the remaining CRYPTO frame from its original frame */ + LIST_DEL_INIT(&cf->ref); + cf->origin = NULL; } LIST_APPEND(outlist, &new_cf->list); /* Consume bytes of the current frame. */ @@ -6772,6 +6775,9 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, /* This frame was duplicated */ LIST_APPEND(&cf->origin->reflist, &new_cf->ref); new_cf->origin = cf->origin; + /* Detach this STREAM frame from its origin */ + LIST_DEL_INIT(&cf->ref); + cf->origin = NULL; } LIST_APPEND(outlist, &new_cf->list); cf->type |= QUIC_STREAM_FRAME_TYPE_OFF_BIT; From efaf93db402f33eb12397631cfd31c7cf6b1d248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Thu, 26 Jan 2023 15:18:17 +0100 Subject: [PATCH 018/140] BUG/MINOR: quic: Too big PTO during handshakes During the handshake and when the handshake has not been confirmed the acknowledgement delays reported by the peer may be larger than max_ack_delay. 
max_ack_delay SHOULD be ignored before the handshake is completed when computing the PTO. But the current code considered the wrong condition "before the hanshake is completed". Replace the enum value QUIC_HS_ST_COMPLETED by QUIC_HS_ST_CONFIRMED to fix this issue. In quic_loss.c, the parameter passed to quic_pto_pktns() is renamed to avoid any possible confusion. Must be backported to 2.7 and 2.6. (cherry picked from commit b75eecc87413da681ce40ce39231680fb5c2cabe) Signed-off-by: Willy Tarreau (cherry picked from commit 44d0398efb2e12f0b37ccd1811b3ad82b794cbdb) Signed-off-by: Willy Tarreau --- src/quic_conn.c | 8 ++++---- src/quic_loss.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 90b14e996..d780d5d71 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -726,7 +726,7 @@ static inline void qc_set_timer(struct quic_conn *qc) { struct quic_pktns *pktns; unsigned int pto; - int handshake_complete; + int handshake_confirmed; TRACE_ENTER(QUIC_EV_CONN_STIMER, qc, NULL, NULL, &qc->path->ifae_pkts); @@ -754,8 +754,8 @@ static inline void qc_set_timer(struct quic_conn *qc) goto out; } - handshake_complete = qc->state >= QUIC_HS_ST_COMPLETE; - pktns = quic_pto_pktns(qc, handshake_complete, &pto); + handshake_confirmed = qc->state >= QUIC_HS_ST_CONFIRMED; + pktns = quic_pto_pktns(qc, handshake_confirmed, &pto); if (tick_isset(pto)) qc->timer = pto; out: @@ -4651,7 +4651,7 @@ struct task *qc_process_timer(struct task *task, void *ctx, unsigned int state) } if (qc->path->in_flight) { - pktns = quic_pto_pktns(qc, qc->state >= QUIC_HS_ST_COMPLETE, NULL); + pktns = quic_pto_pktns(qc, qc->state >= QUIC_HS_ST_CONFIRMED, NULL); if (qc->subs && qc->subs->events & SUB_RETRY_SEND) { pktns->tx.pto_probe = QUIC_MAX_NB_PTO_DGRAMS; tasklet_wakeup(qc->subs->tasklet); diff --git a/src/quic_loss.c b/src/quic_loss.c index 0c7c3a1d9..a92b69942 100644 --- a/src/quic_loss.c +++ b/src/quic_loss.c @@ -80,7 +80,7 @@ struct quic_pktns 
*quic_loss_pktns(struct quic_conn *qc) * as PTO value if not. */ struct quic_pktns *quic_pto_pktns(struct quic_conn *qc, - int handshake_completed, + int handshake_confirmed, unsigned int *pto) { int i; @@ -117,7 +117,7 @@ struct quic_pktns *quic_pto_pktns(struct quic_conn *qc, continue; if (i == QUIC_TLS_PKTNS_01RTT) { - if (!handshake_completed) { + if (!handshake_confirmed) { TRACE_STATE("handshake not already completed", QUIC_EV_CONN_SPTO, qc); pktns = p; goto out; From 0a1d0d32263ef089885778e58996dae738c8dbe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Tue, 31 Jan 2023 10:10:06 +0100 Subject: [PATCH 019/140] BUG/MINOR: quic: Do not ignore coalesced packets in qc_prep_fast_retrans() This function is called only when probing only one packet number space (Handshake) or two times the same one (Application). So, there is no risk to prepare two times the same frame when uneeded because we wanted to probe two packet number spaces. The condition "ignore the packets which has been coalesced to another one" is not necessary. More importantly the bug is when we want to prepare a Application packet which has been coalesced to an Handshake packet. This is always the case when the first Application packet is sent. It is always coalesced to an Handshake packet with an ACK frame. So, when lost, this first application packet was never resent. It contains the HANDSHAKE_DONE frame to confirm the completion of the handshake to the client. Must be backported to 2.6 and 2.7. 
(cherry picked from commit 055e82657ed10017017d4d9f0d6b2bfe96bc0a07) Signed-off-by: Willy Tarreau (cherry picked from commit 36eeeea8449cdb54e170c05c577e846ea5515d06) [wt: adj ctx: no TRACE_PRINTF in 2.6] Signed-off-by: Willy Tarreau --- src/quic_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index d780d5d71..9f7ea4e08 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2476,7 +2476,7 @@ static void qc_prep_fast_retrans(struct quic_conn *qc, pkt = eb64_entry(node, struct quic_tx_packet, pn_node); node = eb64_next(node); /* Skip the empty and coalesced packets */ - if (!LIST_ISEMPTY(&pkt->frms) && !(pkt->flags & QUIC_FL_TX_PACKET_COALESCED)) + if (!LIST_ISEMPTY(&pkt->frms)) break; } From 20fd465d237f39195bb3821b2d6ceeb090cc7ca9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Tue, 31 Jan 2023 17:32:06 +0100 Subject: [PATCH 020/140] MINOR: quic: When probing Handshake packet number space, also probe the Initial one This is not really a bug fix but an improvement. When the Handshake packet number space has been detected as needed to be probed, we should also try to probe the Initial packet number space if there are still packets in flight. Furthermore we should also try to send up to two datagrams. Must be backported to 2.6 and 2.7. 
(cherry picked from commit 37ed4a3842d0f8c4a4f77c5f57b2dc1f0401a7e1) Signed-off-by: Willy Tarreau (cherry picked from commit 1293092e1f8ce50686a787afa7d35c898bd76918) Signed-off-by: Willy Tarreau --- src/quic_conn.c | 44 +++++++++++++++++--------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 9f7ea4e08..302c8cc9c 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -4211,16 +4211,19 @@ static void qc_dgrams_retransmit(struct quic_conn *qc) TRACE_ENTER(QUIC_EV_CONN_TXPKT, qc); if (iqel->pktns->flags & QUIC_FL_PKTNS_PROBE_NEEDED) { - struct list ifrms = LIST_HEAD_INIT(ifrms); - struct list hfrms = LIST_HEAD_INIT(hfrms); + int i; - qc_prep_hdshk_fast_retrans(qc, &ifrms, &hfrms); - TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &ifrms); - TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &hfrms); - if (!LIST_ISEMPTY(&ifrms)) { - iqel->pktns->tx.pto_probe = 1; - if (!LIST_ISEMPTY(&hfrms)) { - hqel->pktns->tx.pto_probe = 1; + for (i = 0; i < QUIC_MAX_NB_PTO_DGRAMS; i++) { + struct list ifrms = LIST_HEAD_INIT(ifrms); + struct list hfrms = LIST_HEAD_INIT(hfrms); + + qc_prep_hdshk_fast_retrans(qc, &ifrms, &hfrms); + TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &ifrms); + TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &hfrms); + if (!LIST_ISEMPTY(&ifrms)) { + iqel->pktns->tx.pto_probe = 1; + if (!LIST_ISEMPTY(&hfrms)) + hqel->pktns->tx.pto_probe = 1; qc_send_hdshk_pkts(qc, 1, QUIC_TLS_ENC_LEVEL_INITIAL, &ifrms, QUIC_TLS_ENC_LEVEL_HANDSHAKE, &hfrms); /* Put back unsent frames in their packet number spaces */ @@ -4228,27 +4231,10 @@ static void qc_dgrams_retransmit(struct quic_conn *qc) LIST_SPLICE(&hqel->pktns->tx.frms, &hfrms); } } - if (hqel->pktns->flags & QUIC_FL_PKTNS_PROBE_NEEDED) { - /* This list has potentially been already used and spliced - * to another one attached to the connection. 
We must reinitialize it. - */ - LIST_INIT(&hfrms); - qc_prep_fast_retrans(qc, hqel, &hfrms, NULL); - TRACE_DEVEL("Avail. ack eliciting frames", QUIC_EV_CONN_FRMLIST, qc, &hfrms); - if (!LIST_ISEMPTY(&hfrms)) { - hqel->pktns->tx.pto_probe = 1; - qc_send_hdshk_pkts(qc, 1, QUIC_TLS_ENC_LEVEL_HANDSHAKE, &hfrms, - QUIC_TLS_ENC_LEVEL_NONE, NULL); - /* Put back unsent frames into their packet number spaces */ - LIST_SPLICE(&hqel->pktns->tx.frms, &hfrms); - } - TRACE_STATE("no more need to probe Handshake packet number space", - QUIC_EV_CONN_TXPKT, qc); - hqel->pktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED; - } TRACE_STATE("no more need to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); iqel->pktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED; + hqel->pktns->flags &= ~QUIC_FL_PKTNS_PROBE_NEEDED; } else { int i; @@ -4671,6 +4657,10 @@ struct task *qc_process_timer(struct task *task, void *ctx, unsigned int state) } else if (pktns == &qc->pktns[QUIC_TLS_PKTNS_HANDSHAKE]) { TRACE_STATE("needs to probe Handshake packet number space", QUIC_EV_CONN_TXPKT, qc); + if (qc->pktns[QUIC_TLS_PKTNS_INITIAL].tx.in_flight) { + qc->pktns[QUIC_TLS_PKTNS_INITIAL].flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } } else if (pktns == &qc->pktns[QUIC_TLS_PKTNS_01RTT]) { TRACE_STATE("needs to probe 01RTT packet number space", QUIC_EV_CONN_TXPKT, qc); From ef73f69075663185ad7df20a62da43441a9d5fbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Wed, 1 Feb 2023 10:31:35 +0100 Subject: [PATCH 021/140] BUG/MAJOR: quic: Possible crash when processing 1-RTT during 0-RTT session This bug was revealed by some C1 interop tests (heavy hanshake packet corruption) when receiving 1-RTT packets with a key phase update. This lead the packet to be decrypted with the next key phase secrets. But this latter is initialized only after the handshake is complete. 
In fact, 1-RTT must never be processed before the handshake is complete. Relying on the "qc->mux_state == QC_MUX_NULL" condition to check the handshake is complete is wrong during 0-RTT sessions when the mux is initialized before the handshake is complete. Must be backported to 2.7 and 2.6. (cherry picked from commit 8417beb7da2b32c33cd703e7a123125c6b0df7b3) Signed-off-by: Willy Tarreau (cherry picked from commit 898de4ac6754cd4b9552116b2ff07e456f5c47f5) Signed-off-by: Willy Tarreau --- src/quic_conn.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/quic_conn.c b/src/quic_conn.c index 302c8cc9c..fc94ea7d0 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -4036,6 +4036,11 @@ static int qc_qel_may_rm_hp(struct quic_conn *qc, struct quic_enc_level *qel) goto cant_rm_hp; } + if (tel == QUIC_TLS_ENC_LEVEL_APP && qc->state < QUIC_HS_ST_COMPLETE) { + TRACE_DEVEL("handshake not complete", QUIC_EV_CONN_TRMHP, qc); + goto cant_rm_hp; + } + /* check if the connection layer is ready before using app level */ if ((tel == QUIC_TLS_ENC_LEVEL_APP || tel == QUIC_TLS_ENC_LEVEL_EARLY_DATA) && qc->mux_state == QC_MUX_NULL) { From 9e63d2a10ad1807eaca6cdbc912893b1fd61a049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Wed, 1 Feb 2023 17:56:57 +0100 Subject: [PATCH 022/140] MEDIUM: quic: Remove qc_conn_finalize() from the ClientHello TLS callbacks This is a bad idea to make the TLS ClientHello callback call qc_conn_finalize(). If this latter fails, this would generate a TLS alert and make the connection send packet whereas it is not functional. But qc_conn_finalize() job was to install the transport parameters sent by the QUIC listener. This installation cannot be done at any time. This must be done after having possibly negotiated the QUIC version and before sending the first Handshake packets. It seems the better moment to do that in when the Handshake TX secrets are derived. This has been found inspecting the ngtcp2 code. 
Calling SSL_set_quic_transport_params() too late would make the ServerHello to be sent without the transport parameters. The code for the connection update which was done from qc_conn_finalize() has been moved to quic_transport_params_store(). So, this update is done as soon as possible. Add QUIC_FL_CONN_TX_TP_RECEIVED to flag the connection as having received the peer transport parameters. Indeed this is required when the ClientHello message is splitted between packets. Add QUIC_FL_CONN_FINALIZED to protect the connection from calling qc_conn_finalize() more than one time. This latter is called only when the connection has received the transport parameters and after returning from SSL_do_hanshake() which is the function which trigger the TLS ClientHello callback call. Remove the calls to qc_conn_finalize() from from the TLS ClientHello callbacks. Must be backported to 2.6. and 2.7. (cherry picked from commit af25a69c8b0faa57a27dd5e9a99cbbe350ee6b07) Signed-off-by: Willy Tarreau (cherry picked from commit 933a302947f615d4407bb327f8ded2dbc915f031) Signed-off-by: Willy Tarreau --- include/haproxy/quic_conn-t.h | 3 + include/haproxy/quic_conn.h | 1 - src/quic_conn.c | 126 ++++++++++++++++------------------ src/quic_tp.c | 18 +++++ src/ssl_sock.c | 11 +-- 5 files changed, 88 insertions(+), 71 deletions(-) diff --git a/include/haproxy/quic_conn-t.h b/include/haproxy/quic_conn-t.h index 6c3948b7d..1f2de5aaa 100644 --- a/include/haproxy/quic_conn-t.h +++ b/include/haproxy/quic_conn-t.h @@ -616,11 +616,14 @@ enum qc_mux_state { /* gap here */ #define QUIC_FL_CONN_HALF_OPEN_CNT_DECREMENTED (1U << 11) /* The half-open connection counter was decremented */ #define QUIC_FL_CONN_HANDSHAKE_SPEED_UP (1U << 12) /* Handshake speeding up was done */ +#define QUIC_FL_CONN_TX_TP_RECEIVED (1U << 25) /* Peer transport parameters have been received (used for the transmitting part) */ +#define QUIC_FL_CONN_FINALIZED (1U << 26) /* QUIC connection finalized (functional, ready to 
send/receive) */ #define QUIC_FL_CONN_NOTIFY_CLOSE (1U << 27) /* MUX notified about quic-conn imminent closure (idle-timeout or CONNECTION_CLOSE emission/reception) */ #define QUIC_FL_CONN_EXP_TIMER (1U << 28) /* timer has expired, quic-conn can be freed */ #define QUIC_FL_CONN_CLOSING (1U << 29) #define QUIC_FL_CONN_DRAINING (1U << 30) #define QUIC_FL_CONN_IMMEDIATE_CLOSE (1U << 31) + struct quic_conn { const struct quic_version *original_version; const struct quic_version *negotiated_version; diff --git a/include/haproxy/quic_conn.h b/include/haproxy/quic_conn.h index 3fc9458bc..6bcfa5900 100644 --- a/include/haproxy/quic_conn.h +++ b/include/haproxy/quic_conn.h @@ -49,7 +49,6 @@ extern struct pool_head *pool_head_quic_connection_id; -int qc_conn_finalize(struct quic_conn *qc, int server); int ssl_quic_initial_ctx(struct bind_conf *bind_conf); /* Return the long packet type matching with version and */ diff --git a/src/quic_conn.c b/src/quic_conn.c index fc94ea7d0..4d56981cd 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -994,6 +994,22 @@ write: goto leave; } + if (level == ssl_encryption_handshake && qc_is_listener(qc)) { + qc->enc_params_len = + quic_transport_params_encode(qc->enc_params, + qc->enc_params + sizeof qc->enc_params, + &qc->rx.params, ver, 1); + if (!qc->enc_params_len) { + TRACE_ERROR("quic_transport_params_encode() failed", QUIC_EV_CONN_RWSEC); + goto leave; + } + + if (!SSL_set_quic_transport_params(qc->xprt_ctx->ssl, qc->enc_params, qc->enc_params_len)) { + TRACE_ERROR("SSL_set_quic_transport_params() failed", QUIC_EV_CONN_RWSEC); + goto leave; + } + } + if (level == ssl_encryption_application) { struct quic_tls_kp *prv_rx = &qc->ku.prv_rx; struct quic_tls_kp *nxt_rx = &qc->ku.nxt_rx; @@ -2217,6 +2233,41 @@ static forceinline void qc_ssl_dump_errors(struct connection *conn) int ssl_sock_get_alpn(const struct connection *conn, void *xprt_ctx, const char **str, int *len); +/* Finalize QUIC connection: + * - initialize the Initial QUIC TLS 
context for negotiated version, + * - derive the secrets for this context, + * - set them into the TLS stack, + * + * MUST be called after having received the remote transport parameters which + * are parsed when the TLS callback for the ClientHello message is called upon + * SSL_do_handshake() calls, not necessarily at the first time as this TLS + * message may be splitted between packets + * Return 1 if succeeded, 0 if not. + */ +static int qc_conn_finalize(struct quic_conn *qc, int server) +{ + int ret = 0; + + TRACE_ENTER(QUIC_EV_CONN_NEW, qc); + + if (qc->flags & QUIC_FL_CONN_FINALIZED) + goto finalized; + + if (qc->negotiated_version && + !qc_new_isecs(qc, &qc->negotiated_ictx, qc->negotiated_version, + qc->odcid.data, qc->odcid.len, server)) + goto out; + + /* This connection is functional (ready to send/receive) */ + qc->flags |= QUIC_FL_CONN_FINALIZED; + + finalized: + ret = 1; + out: + TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); + return ret; +} + /* Provide CRYPTO data to the TLS stack found at with as length * from encryption level with as QUIC connection context. * Remaining parameter are there for debugging purposes. @@ -2253,6 +2304,16 @@ static inline int qc_provide_cdata(struct quic_enc_level *el, state = qc->state; if (state < QUIC_HS_ST_COMPLETE) { ssl_err = SSL_do_handshake(ctx->ssl); + + /* Finalize the connection as soon as possible if the peer transport parameters + * have been received. This may be useful to send packets even if this + * handshake fails. 
+ */ + if ((qc->flags & QUIC_FL_CONN_TX_TP_RECEIVED) && !qc_conn_finalize(qc, 1)) { + TRACE_ERROR("connection finalization failed", QUIC_EV_CONN_IO_CB, qc, &state); + goto leave; + } + if (ssl_err != 1) { ssl_err = SSL_get_error(ctx->ssl, ssl_err); if (ssl_err == SSL_ERROR_WANT_READ || ssl_err == SSL_ERROR_WANT_WRITE) { @@ -5860,71 +5921,6 @@ static int qc_ssl_sess_init(struct quic_conn *qc, SSL_CTX *ssl_ctx, SSL **ssl, goto leave; } -/* Finalize QUIC connection: - * - initialize the Initial QUIC TLS context for negotiated version, - * - derive the secrets for this context, - * - encode the transport parameters to be sent, - * - set them into the TLS stack, - * - initialize ->max_ack_delay and max_idle_timeout, - * - * MUST be called after having received the remote transport parameters. - * Return 1 if succeeded, 0 if not. - */ -int qc_conn_finalize(struct quic_conn *qc, int server) -{ - int ret = 0; - struct quic_transport_params *tx_tp = &qc->tx.params; - struct quic_transport_params *rx_tp = &qc->rx.params; - const struct quic_version *ver; - - TRACE_ENTER(QUIC_EV_CONN_NEW, qc); - - if (tx_tp->version_information.negotiated_version && - tx_tp->version_information.negotiated_version != qc->original_version) { - qc->negotiated_version = - qc->tx.params.version_information.negotiated_version; - if (!qc_new_isecs(qc, &qc->negotiated_ictx, qc->negotiated_version, - qc->odcid.data, qc->odcid.len, !server)) - goto out; - - ver = qc->negotiated_version; - } - else { - ver = qc->original_version; - } - - qc->enc_params_len = - quic_transport_params_encode(qc->enc_params, - qc->enc_params + sizeof qc->enc_params, - &qc->rx.params, ver, 1); - if (!qc->enc_params_len) { - TRACE_ERROR("quic_transport_params_encode() failed", QUIC_EV_CONN_TXPKT); - goto out; - } - - if (!SSL_set_quic_transport_params(qc->xprt_ctx->ssl, qc->enc_params, qc->enc_params_len)) { - TRACE_ERROR("SSL_set_quic_transport_params() failed", QUIC_EV_CONN_TXPKT); - goto out; - } - - if 
(tx_tp->max_ack_delay) - qc->max_ack_delay = tx_tp->max_ack_delay; - - if (tx_tp->max_idle_timeout && rx_tp->max_idle_timeout) - qc->max_idle_timeout = - QUIC_MIN(tx_tp->max_idle_timeout, rx_tp->max_idle_timeout); - else - qc->max_idle_timeout = - QUIC_MAX(tx_tp->max_idle_timeout, rx_tp->max_idle_timeout); - - TRACE_PROTO("\nTX(remote) transp. params.", QUIC_EV_TRANSP_PARAMS, qc, tx_tp); - - ret = 1; - out: - TRACE_LEAVE(QUIC_EV_CONN_NEW, qc); - return ret; -} - /* Allocate the ssl_sock_ctx from connection . This creates the tasklet * used to process received packets. The allocated context is stored in * . diff --git a/src/quic_tp.c b/src/quic_tp.c index 2be6d5150..bdd63190d 100644 --- a/src/quic_tp.c +++ b/src/quic_tp.c @@ -624,6 +624,7 @@ int quic_transport_params_store(struct quic_conn *qc, int server, const unsigned char *end) { struct quic_transport_params *tx_params = &qc->tx.params; + struct quic_transport_params *rx_params = &qc->rx.params; /* initialize peer TPs to RFC default value */ quic_dflt_transport_params_cpy(tx_params); @@ -631,6 +632,23 @@ int quic_transport_params_store(struct quic_conn *qc, int server, if (!quic_transport_params_decode(tx_params, server, buf, end)) return 0; + /* Update the connection from transport parameters received */ + if (tx_params->version_information.negotiated_version && + tx_params->version_information.negotiated_version != qc->original_version) + qc->negotiated_version = + qc->tx.params.version_information.negotiated_version; + + if (tx_params->max_ack_delay) + qc->max_ack_delay = tx_params->max_ack_delay; + + if (tx_params->max_idle_timeout && rx_params->max_idle_timeout) + qc->max_idle_timeout = + QUIC_MIN(tx_params->max_idle_timeout, rx_params->max_idle_timeout); + else + qc->max_idle_timeout = + QUIC_MAX(tx_params->max_idle_timeout, rx_params->max_idle_timeout); + TRACE_PROTO("\nTX(remote) transp. 
params.", QUIC_EV_TRANSP_PARAMS, qc, tx_params); + return 1; } diff --git a/src/ssl_sock.c b/src/ssl_sock.c index 16106e95a..0e2569f80 100644 --- a/src/ssl_sock.c +++ b/src/ssl_sock.c @@ -2679,9 +2679,10 @@ int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *arg) } if (!quic_transport_params_store(qc, 0, extension_data, - extension_data + extension_len) || - !qc_conn_finalize(qc, 0)) + extension_data + extension_len)) goto abort; + + qc->flags |= QUIC_FL_CONN_TX_TP_RECEIVED; } #endif /* USE_QUIC */ @@ -2975,10 +2976,10 @@ int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *priv) } if (!quic_transport_params_store(qc, 0, extension_data, - extension_data + extension_len) || - !qc_conn_finalize(qc, 0)) { + extension_data + extension_len)) return SSL_TLSEXT_ERR_NOACK; - } + + qc->flags |= QUIC_FL_CONN_TX_TP_RECEIVED; } #endif /* USE_QUIC */ From b9d076d9b8abe891dfea5b6401d236212dd91d3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Fri, 3 Feb 2023 16:15:08 +0100 Subject: [PATCH 023/140] BUG/MINOR: quic: Unchecked source connection ID The SCID (source connection ID) used by a peer (client or server) is sent into the long header of a QUIC packet in clear. But it is also sent into the transport parameters (initial_source_connection_id). As these latter are encrypted into the packet, one must check that these two pieces of information do not differ due to a packet header corruption. Furthermore as such a connection is unusuable it must be killed and must stop as soon as possible processing RX/TX packets. Implement qc_kill_con() to flag a connection as unusable and to kille it asap waking up the idle timer task to release the connection. Add a check to quic_transport_params_store() to detect that the SCIDs do not match and make it call qc_kill_con(). 
Add several tests about connection to be killed at several critial locations, especially in the TLS stack callback to receive CRYPTO data from or derive secrets, and before preparing packet after having received others. Must be backported to 2.6 and 2.7. (cherry picked from commit 0aa79953c9779a2da092b3d0f6908726fe7013ec) Signed-off-by: Willy Tarreau (cherry picked from commit b8f3e25677d9433869846e47cefb71b9841e8f12) [wt: adj ctx: no quic_dgram_parse() in 2.6] Signed-off-by: Willy Tarreau --- include/haproxy/quic_conn-t.h | 1 + include/haproxy/quic_conn.h | 2 ++ src/quic_conn.c | 37 ++++++++++++++++++++++++++++++++++- src/quic_tp.c | 16 ++++++++++++++- 4 files changed, 54 insertions(+), 2 deletions(-) diff --git a/include/haproxy/quic_conn-t.h b/include/haproxy/quic_conn-t.h index 1f2de5aaa..7e8634893 100644 --- a/include/haproxy/quic_conn-t.h +++ b/include/haproxy/quic_conn-t.h @@ -616,6 +616,7 @@ enum qc_mux_state { /* gap here */ #define QUIC_FL_CONN_HALF_OPEN_CNT_DECREMENTED (1U << 11) /* The half-open connection counter was decremented */ #define QUIC_FL_CONN_HANDSHAKE_SPEED_UP (1U << 12) /* Handshake speeding up was done */ +#define QUIC_FL_CONN_TO_KILL (1U << 24) /* Unusable connection, to be killed */ #define QUIC_FL_CONN_TX_TP_RECEIVED (1U << 25) /* Peer transport parameters have been received (used for the transmitting part) */ #define QUIC_FL_CONN_FINALIZED (1U << 26) /* QUIC connection finalized (functional, ready to send/receive) */ #define QUIC_FL_CONN_NOTIFY_CLOSE (1U << 27) /* MUX notified about quic-conn imminent closure (idle-timeout or CONNECTION_CLOSE emission/reception) */ diff --git a/include/haproxy/quic_conn.h b/include/haproxy/quic_conn.h index 6bcfa5900..ba0f18a0a 100644 --- a/include/haproxy/quic_conn.h +++ b/include/haproxy/quic_conn.h @@ -726,5 +726,7 @@ void qc_check_close_on_released_mux(struct quic_conn *qc); void quic_conn_release(struct quic_conn *qc); +void qc_kill_conn(struct quic_conn *qc); + #endif /* USE_QUIC */ #endif /* 
_HAPROXY_QUIC_CONN_H */ diff --git a/src/quic_conn.c b/src/quic_conn.c index 4d56981cd..3a1619b5f 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -271,7 +271,9 @@ static void quic_trace(enum trace_level level, uint64_t mask, const struct trace if (mask & QUIC_EV_TRANSP_PARAMS) { const struct quic_transport_params *p = a2; - quic_transport_params_dump(&trace_buf, qc, p); + + if (p) + quic_transport_params_dump(&trace_buf, qc, p); } if (mask & QUIC_EV_CONN_ADDDATA) { @@ -719,6 +721,13 @@ static inline int quic_peer_validated_addr(struct quic_conn *qc) return 0; } +/* To be called to kill a connection as soon as possible (without sending any packet). */ +void qc_kill_conn(struct quic_conn *qc) +{ + qc->flags |= QUIC_FL_CONN_TO_KILL; + task_wakeup(qc->idle_timer_task, TASK_WOKEN_OTHER); +} + /* Set the timer attached to the QUIC connection with as I/O handler and used for * both loss detection and PTO and schedule the task assiated to this timer if needed. */ @@ -918,6 +927,12 @@ int ha_quic_set_encryption_secrets(SSL *ssl, enum ssl_encryption_level_t level, TRACE_ENTER(QUIC_EV_CONN_RWSEC, qc); BUG_ON(secret_len > QUIC_TLS_SECRET_LEN); + + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_PROTO("connection to be killed", QUIC_EV_CONN_ADDDATA, qc); + goto out; + } + if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) { TRACE_PROTO("CC required", QUIC_EV_CONN_RWSEC, qc); goto out; @@ -1214,6 +1229,11 @@ int ha_quic_add_handshake_data(SSL *ssl, enum ssl_encryption_level_t level, qc = SSL_get_ex_data(ssl, ssl_qc_app_data_index); TRACE_ENTER(QUIC_EV_CONN_ADDDATA, qc); + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_PROTO("connection to be killed", QUIC_EV_CONN_ADDDATA, qc); + goto out; + } + if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) { TRACE_PROTO("CC required", QUIC_EV_CONN_ADDDATA, qc); goto out; @@ -2305,6 +2325,11 @@ static inline int qc_provide_cdata(struct quic_enc_level *el, if (state < QUIC_HS_ST_COMPLETE) { ssl_err = SSL_do_handshake(ctx->ssl); + if (qc->flags & 
QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_IO_CB, qc); + goto leave; + } + /* Finalize the connection as soon as possible if the peer transport parameters * have been received. This may be useful to send packets even if this * handshake fails. @@ -4378,6 +4403,11 @@ struct task *quic_conn_app_io_cb(struct task *t, void *context, unsigned int sta goto out; } + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_IO_CB, qc); + goto out; + } + if ((qc->flags & QUIC_FL_CONN_DRAINING) && !(qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE)) { TRACE_STATE("draining connection (must not send packets)", QUIC_EV_CONN_IO_CB, qc); @@ -4470,6 +4500,11 @@ struct task *quic_conn_io_cb(struct task *t, void *context, unsigned int state) if (!qc_treat_rx_pkts(qc, qel, next_qel, force_ack)) goto out; + if (qc->flags & QUIC_FL_CONN_TO_KILL) { + TRACE_DEVEL("connection to be killed", QUIC_EV_CONN_PHPKTS, qc); + goto out; + } + if ((qc->flags & QUIC_FL_CONN_DRAINING) && !(qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE)) goto out; diff --git a/src/quic_tp.c b/src/quic_tp.c index bdd63190d..8eeb455ae 100644 --- a/src/quic_tp.c +++ b/src/quic_tp.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include @@ -625,6 +625,8 @@ int quic_transport_params_store(struct quic_conn *qc, int server, { struct quic_transport_params *tx_params = &qc->tx.params; struct quic_transport_params *rx_params = &qc->rx.params; + /* Initial source connection ID */ + struct tp_cid *iscid; /* initialize peer TPs to RFC default value */ quic_dflt_transport_params_cpy(tx_params); @@ -649,6 +651,18 @@ int quic_transport_params_store(struct quic_conn *qc, int server, QUIC_MAX(tx_params->max_idle_timeout, rx_params->max_idle_timeout); TRACE_PROTO("\nTX(remote) transp. 
params.", QUIC_EV_TRANSP_PARAMS, qc, tx_params); + /* Check that the "initial_source_connection_id" transport parameter matches + * the SCID received which is also the DCID of the connection. + */ + iscid = &tx_params->initial_source_connection_id; + if (qc->dcid.len != iscid->len || + (qc->dcid.len && memcmp(qc->dcid.data, iscid->data, qc->dcid.len))) { + TRACE_PROTO("initial_source_connection_id transport parameter mismatch", + QUIC_EV_TRANSP_PARAMS, qc); + /* Kill the connection as soon as possible */ + qc_kill_conn(qc); + } + return 1; } From fd6f24cd6390bd9c1b95eefa10f275efd92a1b9a Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Fri, 3 Feb 2023 18:39:06 +0100 Subject: [PATCH 024/140] BUG/MEDIUM: quic: do not split STREAM frames if no space When building STREAM frames in a packet buffer, if a frame is too large it will be splitted in two. A shorten version will be used and the original frame will be modified to represent the remaining space. To ensure there is enough space to store the frame data length encoded as a QUIC integer, we use the function max_available_room(). This function can return 0 if there not only a small space left which is insufficient for the frame header and the shorten data. Prior to this patch, this wasn't check and an empty unneeded STREAM frame was built and sent for nothing. Change this by checking the value return by max_available_room(). If 0, do not try to split this frame and continue to the next ones in the packet. On 2.6, this patch serves as an optimization which will prevent the building of unneeded empty STREAM frames. On 2.7, this behavior has the side-effect of triggering a BUG_ON() statement on quic_build_stream_frame(). This BUG_ON() ensures that we do not use quic_frame with OFF bit set if its offset is 0. This can happens if the condition defined above is reproduced for a STREAM frame at offset 0. An empty unneeded frame is built as descibed. 
The problem is that the original frame is modified with its OFF bit set even if the offset is still 0. This must be backported up to 2.6. (cherry picked from commit f2f08f88ef58bdf8b85e73596460af83825237cd) Signed-off-by: Willy Tarreau (cherry picked from commit a7c44c4d6e57068db7fb41643d4e241f91ea208e) Signed-off-by: Willy Tarreau --- src/quic_conn.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 3a1619b5f..26bd30760 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -6740,11 +6740,13 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, TRACE_DEVEL(" New STREAM frame build (room, len)", QUIC_EV_CONN_BCFRMS, qc, &room, len); + + /* hlen contains STREAM id and offset. Ensure there is + * enough room for length field. + */ if (cf->type & QUIC_STREAM_FRAME_TYPE_LEN_BIT) { - dlen = max_available_room(avail_room, &dlen_sz); - if (dlen > cf->stream.len) { - dlen = cf->stream.len; - } + dlen = QUIC_MIN((uint64_t)max_available_room(avail_room, &dlen_sz), + cf->stream.len); dlen_sz = quic_int_getsize(dlen); flen = hlen + dlen_sz + dlen; } @@ -6752,6 +6754,14 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, dlen = QUIC_MIN((uint64_t)avail_room, cf->stream.len); flen = hlen + dlen; } + + if (cf->stream.len && !dlen) { + /* Only a small gap is left on buffer, not + * enough to encode the STREAM data length. + */ + continue; + } + TRACE_DEVEL(" STREAM data length (hlen, stream.len, dlen)", QUIC_EV_CONN_BCFRMS, qc, &hlen, &cf->stream.len, &dlen); TRACE_DEVEL(" STREAM frame length (flen)", From 49a324dfd2a3ec08d8af8682c28fc7037ac0bdc6 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 7 Feb 2023 17:06:35 +0100 Subject: [PATCH 025/140] BUG/MINOR: ssl/crt-list: warn when a line is malformated Display a warning when some text exists between the filename and the options. 
This part is completely ignored so if there are filters here, they were never parsed. This could be backported in every versions. In the older versions, the parsing was done in ssl_sock_load_cert_list_file() in ssl_sock.c. (cherry picked from commit d85227fca20a5c793857c1632283ef4a2120285a) Signed-off-by: Willy Tarreau (cherry picked from commit e534f74a85d337c3957865f5abe54cacc0ed4154) Signed-off-by: Willy Tarreau --- src/ssl_crtlist.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/ssl_crtlist.c b/src/ssl_crtlist.c index a8cd24044..22fe54228 100644 --- a/src/ssl_crtlist.c +++ b/src/ssl_crtlist.c @@ -403,6 +403,11 @@ int crtlist_parse_line(char *line, char **crt_path, struct crtlist_entry *entry, *crt_path = args[0]; if (ssl_b) { + if (ssl_b > 1) { + memprintf(err, "parsing [%s:%d]: malformated line, filters can't be between filename and options!", file, linenum); + cfgerr |= ERR_WARN; + } + ssl_conf = calloc(1, sizeof *ssl_conf); if (!ssl_conf) { memprintf(err, "not enough memory!"); From 75cf533938a79c4365678cdcb0a9f2c4ebc182d9 Mon Sep 17 00:00:00 2001 From: Aleksey Ponomaryov Date: Tue, 7 Feb 2023 19:27:06 +0100 Subject: [PATCH 026/140] BUG/MEDIUM: stick-table: do not leave entries in end of window during purge At some moments expired stick table records stop being removed. This happens when the internal time wraps around the 32-bit limit, or every 49.7 days. What precisely happens is that some elements that are collected close to the end of the time window (2^32 - table's "expire" setting) might have been updated and will be requeued further, at the beginning of the next window. Here, three bad situations happen: - the incorrect integer-based comparison that is not aware of wrapping will result in the scan to restart from the freshly requeued element, skipping all those at the end of the window. 
The net effect of this is that at each wakeup of the expiration task, only one element from the end of the window will be expired, and other ones will remain there for a very long time, especially if they have to wait for all the predecessors to be picked one at a time after slow wakeups due to a long expiration ; this is what was observed in issue #2034 making the table fill up and appear as not expiring at all, and it seems that issue #2024 reports the same problem at the same moment (since such issues happen for everyone roughly at the same time when the clock doesn't drift too much). - the elements that were placed at the beginning of the next window are skipped as well for as long as there are refreshed entries at the end of the previous window, so these ones participate in filling the table as well. This is caused by the restart from the current, updated node that is generally placed after most other less recently updated elements. - once the last element at the end of the window is picked, suddenly there is a large amount of expired entries at the beginning of the next window that all have to be requeued. If the expiration delay is large, the number can be big and it can take a long time, which can very likely explain the periodic crashes reported in issue #2025. Limiting the batch size as done in commit dfe79251d ("BUG/MEDIUM: stick-table: limit the time spent purging old entries") would make sense for process_table_expire() as well. 
This patch addresses the incorrect tree scan algorithm to make sure that: - there's always a next element to compare against, even when dealing with the last one in the tree, the first one must be used ; - time comparisons used to decide whether to restart from the current element use tick_is_lt() as it is the only case where we know the current element will be placed before any other one (since the tree respects insertion ordering for duplicates) In order to reproduce the issue, it was found that injecting traffic on a random key that spans over half of the size of a table whose expiration is set to 15s while the date is going to wrap in 20s does exhibit an increase of the table's size 5s after startup, when entries start to be pushed to the next window. It's more effective when a second load generator constantly hammers a same key to be certain that none of them is ready to expire. This doesn't happen anymore after this patch. This fix needs to be backported to all stable versions. The bug has been there for as long as the stick tables were introduced in 1.4-dev7 with commit 3bd697e07 ("[MEDIUM] Add stick table (persistence) management functions and types"). A cleanup could consist in deduplicating that code by having process_table_expire() call __stktable_trash_oldest(), with that one improved to support an optional time check. 
(cherry picked from commit 593802128c332bd2b197ef14d2d7df87eab5443c) Signed-off-by: Willy Tarreau (cherry picked from commit a2997c154164dd11db481dc7d81453f9212af981) Signed-off-by: Willy Tarreau --- src/stick_table.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/stick_table.c b/src/stick_table.c index 6eea7cbbb..e0a2c93a8 100644 --- a/src/stick_table.c +++ b/src/stick_table.c @@ -224,7 +224,18 @@ int __stktable_trash_oldest(struct stktable *t, int to_batch) ts->exp.key = ts->expire; eb32_insert(&t->exps, &ts->exp); - if (!eb || eb->key > ts->exp.key) + /* the update might have jumped beyond the next element, + * possibly causing a wrapping. We need to check whether + * the next element should be used instead. If the next + * element doesn't exist it means we're on the right + * side and have to check the first one then. If it + * exists and is closer, we must use it, otherwise we + * use the current one. + */ + if (!eb) + eb = eb32_first(&t->exps); + + if (!eb || tick_is_lt(ts->exp.key, eb->key)) eb = &ts->exp; continue; @@ -607,7 +618,18 @@ struct task *process_table_expire(struct task *task, void *context, unsigned int ts->exp.key = ts->expire; eb32_insert(&t->exps, &ts->exp); - if (!eb || eb->key > ts->exp.key) + /* the update might have jumped beyond the next element, + * possibly causing a wrapping. We need to check whether + * the next element should be used instead. If the next + * element doesn't exist it means we're on the right + * side and have to check the first one then. If it + * exists and is closer, we must use it, otherwise we + * use the current one. 
+ */ + if (!eb) + eb = eb32_first(&t->exps); + + if (!eb || tick_is_lt(ts->exp.key, eb->key)) eb = &ts->exp; continue; } From eb039d10920a631a2946a9c1562c997b2a84700f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 7 Feb 2023 15:22:41 +0100 Subject: [PATCH 027/140] BUG/MEDIUM: cache: use the correct time reference when comparing dates The cache makes use of dates advertised by external components, such as "last-modified" or "date". As such these are wall-clock dates, and not internal dates. However, all comparisons are mistakenly made based on the internal monotonic date which is designed to drift from the wall clock one in order to catch up with stolen time (which can sometimes be intense in VMs). As such after some run time some objects may fail to validate or fail to expire depending on the direction of the drift. This is particularly visible when applying an offset to the internal time to force it to wrap soon after startup, as it will be shifted up to 49.7 days in the future depending on the current date; what happens in this case is that the reg-test "cache_expires.vtc" fails on the 3rd test by returning stale contents from the cache at the date of this commit. It is really important that all external dates are compared against "date" and not "now" for this reason. This fix needs to be backported to all versions. (cherry picked from commit 9b5d57dfd5047b167896e08128c029dc7d3615b8) Signed-off-by: Willy Tarreau (cherry picked from commit c188486a2872a8fc6bd934b2fc1c421f068e18e0) Signed-off-by: Willy Tarreau --- src/cache.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/cache.c b/src/cache.c index 604d98d20..4992b5e22 100644 --- a/src/cache.c +++ b/src/cache.c @@ -155,7 +155,7 @@ struct cache_st { struct cache_entry { unsigned int complete; /* An entry won't be valid until complete is not null. 
*/ unsigned int latest_validation; /* latest validation date */ - unsigned int expire; /* expiration date */ + unsigned int expire; /* expiration date (wall clock time) */ unsigned int age; /* Origin server "Age" header value */ struct eb32_node eb; /* ebtree node used to hold the cache object */ @@ -207,7 +207,7 @@ struct cache_entry *entry_exist(struct cache *cache, char *hash) if (memcmp(entry->hash, hash, sizeof(entry->hash))) return NULL; - if (entry->expire > now.tv_sec) { + if (entry->expire > date.tv_sec) { return entry; } else { delete_entry(entry); @@ -268,7 +268,7 @@ struct cache_entry *secondary_entry_exist(struct cache *cache, struct cache_entr * when we find them. Calling delete_entry would be too costly * so we simply call eb32_delete. The secondary_entry count will * be updated when we try to insert a new entry to this list. */ - if (entry->expire <= now.tv_sec) { + if (entry->expire <= date.tv_sec) { eb32_delete(&entry->eb); entry->eb.key = 0; } @@ -277,7 +277,7 @@ struct cache_entry *secondary_entry_exist(struct cache *cache, struct cache_entr } /* Expired entry */ - if (entry && entry->expire <= now.tv_sec) { + if (entry && entry->expire <= date.tv_sec) { eb32_delete(&entry->eb); entry->eb.key = 0; entry = NULL; @@ -302,7 +302,7 @@ static unsigned int clear_expired_duplicates(struct eb32_node **dup_tail) while (prev) { entry = container_of(prev, struct cache_entry, eb); prev = eb32_prev_dup(prev); - if (entry->expire <= now.tv_sec) { + if (entry->expire <= date.tv_sec) { eb32_delete(&entry->eb); entry->eb.key = 0; } @@ -334,7 +334,7 @@ static struct eb32_node *insert_entry(struct cache *cache, struct cache_entry *n struct eb32_node *prev = NULL; struct cache_entry *entry = NULL; unsigned int entry_count = 0; - unsigned int last_clear_ts = now.tv_sec; + unsigned int last_clear_ts = date.tv_sec; struct eb32_node *node = eb32_insert(&cache->entries, &new_entry->eb); @@ -357,7 +357,7 @@ static struct eb32_node *insert_entry(struct cache *cache, 
struct cache_entry *n * space. In order to avoid going over the same list too * often, we first check the timestamp of the last check * performed. */ - if (last_clear_ts == now.tv_sec) { + if (last_clear_ts == date.tv_sec) { /* Too many entries for this primary key, clear the * one that was inserted. */ eb32_delete(node); @@ -370,7 +370,7 @@ static struct eb32_node *insert_entry(struct cache *cache, struct cache_entry *n /* Still too many entries for this primary key, delete * the newly inserted one. */ entry = container_of(prev, struct cache_entry, eb); - entry->last_clear_ts = now.tv_sec; + entry->last_clear_ts = date.tv_sec; eb32_delete(node); node->key = 0; return NULL; @@ -829,8 +829,8 @@ int http_calc_maxage(struct stream *s, struct cache *cache, int *true_maxage) /* A request having an expiring date earlier * than the current date should be considered as * stale. */ - expires = (expires_val >= now.tv_sec) ? - (expires_val - now.tv_sec) : 0; + expires = (expires_val >= date.tv_sec) ? + (expires_val - date.tv_sec) : 0; } else { /* Following RFC 7234#5.3, an invalid date @@ -904,7 +904,7 @@ static time_t get_last_modified_time(struct htx *htx) /* Fallback on the current time if no "Last-Modified" or "Date" header * was found. */ if (!last_modified) - last_modified = now.tv_sec; + last_modified = date.tv_sec; return last_modified; } @@ -1138,7 +1138,7 @@ enum act_return http_action_store_cache(struct act_rule *rule, struct proxy *px, * is set by the end of this function (in case of concurrent accesses to * the same resource). This way the second access will find an existing * but not yet usable entry in the tree and will avoid storing its data. 
*/ - object->expire = now.tv_sec + 2; + object->expire = date.tv_sec + 2; memcpy(object->hash, txn->cache_hash, sizeof(object->hash)); if (vary_signature) @@ -1242,8 +1242,8 @@ enum act_return http_action_store_cache(struct act_rule *rule, struct proxy *px, if (cache_ctx) { cache_ctx->first_block = first; /* store latest value and expiration time */ - object->latest_validation = now.tv_sec; - object->expire = now.tv_sec + effective_maxage; + object->latest_validation = date.tv_sec; + object->expire = date.tv_sec + effective_maxage; return ACT_RET_CONT; } @@ -1440,7 +1440,7 @@ static int htx_cache_add_age_hdr(struct appctx *appctx, struct htx *htx) char *end; chunk_reset(&trash); - age = MAX(0, (int)(now.tv_sec - cache_ptr->latest_validation)) + cache_ptr->age; + age = MAX(0, (int)(date.tv_sec - cache_ptr->latest_validation)) + cache_ptr->age; if (unlikely(age > CACHE_ENTRY_MAX_AGE)) age = CACHE_ENTRY_MAX_AGE; end = ultoa_o(age, b_head(&trash), b_size(&trash)); @@ -2629,13 +2629,13 @@ static int cli_io_handler_show_cache(struct appctx *appctx) entry = container_of(node, struct cache_entry, eb); next_key = node->key + 1; - if (entry->expire > now.tv_sec) { + if (entry->expire > date.tv_sec) { chunk_printf(&trash, "%p hash:%u vary:0x", entry, read_u32(entry->hash)); for (i = 0; i < HTTP_CACHE_SEC_KEY_LEN; ++i) chunk_appendf(&trash, "%02x", (unsigned char)entry->secondary_key[i]); chunk_appendf(&trash, " size:%u (%u blocks), refcount:%u, expire:%d\n", block_ptr(entry)->len, block_ptr(entry)->block_count, - block_ptr(entry)->refcount, entry->expire - (int)now.tv_sec); + block_ptr(entry)->refcount, entry->expire - (int)date.tv_sec); } else { /* time to remove that one */ delete_entry(entry); From 393d7d42843251cfc06b1e7a6335f70a5b4c5551 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 12 Jan 2023 15:06:11 +0100 Subject: [PATCH 028/140] DOC: config: fix option spop-check proxy compatibility The doc mentioned that spop-check option may only be used for backends. 
However, option may be used in default and listen sections as well according to the code. Let's fix the doc so that doc and code are consistent to each other. This could be backported to all stable versions. (cherry picked from commit f3a2ae7c6354bfbb1054a54d46658cf4410fd4bb) Signed-off-by: Willy Tarreau (cherry picked from commit f0c6f5ea7296c218ea4a1f28c684ad64de2f6bc8) Signed-off-by: Willy Tarreau --- doc/configuration.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 2152d87bb..9d587c9be 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -4053,7 +4053,7 @@ option socket-stats (*) X X X - option splice-auto (*) X X X X option splice-request (*) X X X X option splice-response (*) X X X X -option spop-check - - - X +option spop-check X - X X option srvtcpka (*) X - X X option ssl-hello-chk X - X X -- keyword -------------------------- defaults - frontend - listen -- backend - @@ -9958,7 +9958,7 @@ no option splice-response option spop-check Use SPOP health checks for server testing May be used in sections : defaults | frontend | listen | backend - no | no | no | yes + yes | no | yes | yes Arguments : none It is possible to test that the server correctly talks SPOP protocol instead From 7c1fbab0e12756e6a6d1a519c4a0281004bbc9a9 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 12 Jan 2023 15:59:27 +0100 Subject: [PATCH 029/140] DOC: config: 'http-send-name-header' option may be used in default section Both doc and code agree on the fact that 'http-send-name-header' option could be used in default section, but the keyword compatibility matrix in configuration.txt reported the opposite. This could be backported to all stable versions. 
(cherry picked from commit df238c34c2f82e4a999dab1021608e2aecf80e8f) Signed-off-by: Willy Tarreau (cherry picked from commit bb4bacf32db7564e668ccc3aacc6bed8688894ba) Signed-off-by: Willy Tarreau --- doc/configuration.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 9d587c9be..cdbcc948e 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3995,7 +3995,7 @@ http-error X X X X http-request X (!) X X X http-response X (!) X X X http-reuse X - X X -http-send-name-header - - X X +http-send-name-header X - X X id - X X X ignore-persist - - X X load-server-state-from-file X - X X From ce73eb009da52a4ba877fcf88c3045d3d909b119 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Wed, 8 Feb 2023 11:49:02 +0100 Subject: [PATCH 030/140] MINOR: cfgparse/server: move (min/max)conn postparsing logic into dedicated function In check_config_validity() function, we performed some consistency checks to adjust minconn/maxconn attributes for each declared server. 
We move this logic into a dedicated function named srv_minmax_conn_apply() to be able to perform those checks later in the process life when needed (ie: dynamic servers) (cherry picked from commit 3e7a0bb70b2f1a81d17163f27132f2f44b71521e) [wt: needed for next fix] Signed-off-by: Willy Tarreau (cherry picked from commit 5bda7da2467095177468e0af211d36673281de72) Signed-off-by: Willy Tarreau --- include/haproxy/server.h | 20 ++++++++++++++++++++ src/cfgparse.c | 11 +---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/haproxy/server.h b/include/haproxy/server.h index 85cf98f4e..76023b66b 100644 --- a/include/haproxy/server.h +++ b/include/haproxy/server.h @@ -281,6 +281,26 @@ static inline void srv_use_conn(struct server *srv, struct connection *conn) HA_ATOMIC_STORE(&srv->est_need_conns, curr); } +/* checks if minconn and maxconn are consistent to each other + * and automatically adjust them if it is not the case + * This logic was historically implemented in check_config_validity() + * at boot time, but with the introduction of dynamic servers + * this may be used at multiple places in the code now + */ +static inline void srv_minmax_conn_apply(struct server *srv) +{ + if (srv->minconn > srv->maxconn) { + /* Only 'minconn' was specified, or it was higher than or equal + * to 'maxconn'. Let's turn this into maxconn and clean it, as + * this will avoid further useless expensive computations. 
+ */ + srv->maxconn = srv->minconn; + } else if (srv->maxconn && !srv->minconn) { + /* minconn was not specified, so we set it to maxconn */ + srv->minconn = srv->maxconn; + } +} + #endif /* _HAPROXY_SERVER_H */ /* diff --git a/src/cfgparse.c b/src/cfgparse.c index 9ebf96cd6..da6127575 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -3501,16 +3501,7 @@ out_uri_auth_compat: while (newsrv != NULL) { set_usermsgs_ctx(newsrv->conf.file, newsrv->conf.line, &newsrv->obj_type); - if (newsrv->minconn > newsrv->maxconn) { - /* Only 'minconn' was specified, or it was higher than or equal - * to 'maxconn'. Let's turn this into maxconn and clean it, as - * this will avoid further useless expensive computations. - */ - newsrv->maxconn = newsrv->minconn; - } else if (newsrv->maxconn && !newsrv->minconn) { - /* minconn was not specified, so we set it to maxconn */ - newsrv->minconn = newsrv->maxconn; - } + srv_minmax_conn_apply(newsrv); /* this will also properly set the transport layer for * prod and checks From 0a7750152e8f4801abd9a5b28e8a620c082bd11f Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Wed, 8 Feb 2023 11:55:08 +0100 Subject: [PATCH 031/140] BUG/MINOR: server/add: ensure minconn/maxconn consistency when adding server When a new server was added through the cli using "server add" command, the maxconn/minconn consistency check historically implemented in check_config_validity() for static servers was missing. As a result, when adding a server with the maxconn parameter without the minconn set, the server was unable to handle any connection because srv_dynamic_maxconn() would always return 0. 
Consider the following reproducer: | global | stats socket /tmp/ha.sock mode 660 level admin expose-fd listeners | | defaults | timeout client 5s | timeout server 5s | timeout connect 5s | | frontend test | mode http | bind *:8081 | use_backend farm | | listen dummyok | bind localhost:18999 | mode http | http-request return status 200 hdr test "ok" | | backend farm | mode http Start haproxy and perform the following : echo "add server farm/t1 127.0.0.1:18999 maxconn 100" | nc -U /tmp/ha.sock echo "enable server farm/t1" | nc -U /tmp/ha.sock curl localhost:8081 # -> 503 after 5s connect timeout Thanks to ("MINOR: cfgparse/server: move (min/max)conn postparsing logic into dedicated function"), we are now able to perform the consistency check after the new dynamic server has been parsed. This is enough to fix the issue documented here that was reported by Thomas Pedoussaut on the ML. This commit depends on: - ("MINOR: cfgparse/server: move (min/max)conn postparsing logic into dedicated function") It must be backported to 2.6 and 2.7 (cherry picked from commit 86207e782cea35c4f85657855ed47ab3295e2695) Signed-off-by: Willy Tarreau (cherry picked from commit d736eedbcc68e49182096f23403222ea44b8d03b) Signed-off-by: Willy Tarreau --- src/server.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/server.c b/src/server.c index 8f9ba6c6b..cccf2f5a5 100644 --- a/src/server.c +++ b/src/server.c @@ -4758,6 +4758,9 @@ static int cli_parse_add_server(char **args, char *payload, struct appctx *appct goto out; } + /* ensure minconn/maxconn consistency */ + srv_minmax_conn_apply(srv); + if (srv->use_ssl == 1 || (srv->proxy->options & PR_O_TCPCHK_SSL) || srv->check.use_ssl == 1) { if (xprt_get(XPRT_SSL) && xprt_get(XPRT_SSL)->prepare_srv) { From e537945e72d08425ba42860b356621704d0a77e3 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Wed, 8 Feb 2023 16:18:48 +0100 Subject: [PATCH 032/140] BUG/MEDIUM: stconn: Schedule a shutw on shutr if data must be sent first The commit 
7f59d68fe ("BUG/MEDIIM: stconn: Flush output data before forwarding close to write side") introduced a regression. When the read side is closed, the close is not forwarded to the write side if there are some pending outgoind data. The idea is to foward data first and the close the write side. However, when fast-forwarding is enabled and last data block is received with the read0, the close is never forwarded. We cannot revert the commit above because it really fix an issue. However, we can schedule the shutdown for write by setting CF_SHUTW_NOW flag on the write side. Indeed, it is the purpose of this flag. To not replicate ugly and hardly maintainable code block at different places in stconn.c, an helper function is used. Thus, sc_cond_forward_shutw() must be called to know if the close can be fowarded or not. It returns 1 if it is possible. In this case, the caller is responsible to forward the close to the write side. Otherwise, if the close cannot be forwarded, 0 is returned. It happens when it should not be performed at all. Or when it should only be delayed, waiting for the input channel to be flushed. In this last case, the CF_SHUTW_NOW flag is set in the output channel. This patch should fix the issue #2033. It must be backported with the commit above, thus at least as far as 2.2. (cherry picked from commit eb3f26d5a023431d28107d29a60f0e923dcfc85e) Signed-off-by: Willy Tarreau (cherry picked from commit 8c5ac22886b7786234c13c4701c8b0f5503ae43f) Signed-off-by: Willy Tarreau --- src/stconn.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/src/stconn.c b/src/stconn.c index 677fb4e4a..0dd110ea6 100644 --- a/src/stconn.c +++ b/src/stconn.c @@ -498,6 +498,30 @@ struct appctx *sc_applet_create(struct stconn *sc, struct applet *app) return appctx; } +/* Conditionnaly forward the close to the wirte side. It return 1 if it can be + * forwarded. 
It is the caller responsibility to forward the close to the write + * side. Otherwise, 0 is returned. In this case, CF_SHUTW_NOW flag may be set on + * the channel if we are only waiting for the outgoing data to be flushed. + */ +static inline int sc_cond_forward_shutw(struct stconn *sc) +{ + /* The close must not be forwarded */ + if (!(sc_ic(sc)->flags & CF_SHUTR) || !(sc->flags & SC_FL_NOHALF)) + return 0; + + if (!channel_is_empty(sc_ic(sc))) { + /* the close to the write side cannot be forwarded now because + * we should flush outgoing data first. But instruct the output + * channel it should be done ASAP. + */ + channel_shutw_now(sc_oc(sc)); + return 0; + } + + /* the close can be immediately forwarded to the write side */ + return 1; +} + /* * This function performs a shutdown-read on a detached stream connector in a * connected or init state (it does nothing for other states). It either shuts @@ -522,10 +546,8 @@ static void sc_app_shutr(struct stconn *sc) if (sc->flags & SC_FL_ISBACK) __sc_strm(sc)->conn_exp = TICK_ETERNITY; } - else if ((sc->flags & SC_FL_NOHALF) && channel_is_empty(ic)) { - /* we want to immediately forward this close to the write side */ + else if (sc_cond_forward_shutw(sc)) return sc_app_shutw(sc); - } /* note that if the task exists, it must unregister itself once it runs */ if (!(sc->flags & SC_FL_DONT_WAKE)) @@ -666,10 +688,8 @@ static void sc_app_shutr_conn(struct stconn *sc) if (sc->flags & SC_FL_ISBACK) __sc_strm(sc)->conn_exp = TICK_ETERNITY; } - else if ((sc->flags & SC_FL_NOHALF) && channel_is_empty(ic)) { - /* we want to immediately forward this close to the write side */ + else if (sc_cond_forward_shutw(sc)) return sc_app_shutw_conn(sc); - } } /* @@ -894,10 +914,8 @@ static void sc_app_shutr_applet(struct stconn *sc) if (sc->flags & SC_FL_ISBACK) __sc_strm(sc)->conn_exp = TICK_ETERNITY; } - else if ((sc->flags & SC_FL_NOHALF) && channel_is_empty(ic)) { - /* we want to immediately forward this close to the write side */ + 
else if (sc_cond_forward_shutw(sc)) return sc_app_shutw_applet(sc); - } } /* @@ -1253,7 +1271,7 @@ static void sc_conn_read0(struct stconn *sc) if (oc->flags & CF_SHUTW) goto do_close; - if ((sc->flags & SC_FL_NOHALF) && channel_is_empty(ic)) { + if (sc_cond_forward_shutw(sc)) { /* we want to immediately forward this close to the write side */ /* force flag on ssl to keep stream in cache */ sc_conn_shutw(sc, CO_SHW_SILENT); From 3313d8de74c73602751b3de9312885cabd56b237 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 9 Feb 2023 17:53:41 +0100 Subject: [PATCH 033/140] BUG/MEDIUM: quic: fix crash when "option nolinger" is set in the frontend Commit 0aba11e9e ("MINOR: quic: remove unnecessary quic_session_accept()") overlooked one problem, in session_accept_fd() at the end, there's a bunch of FD-specific stuff that either sets up or resets the socket at the TCP level. The tests are mostly performed for AF_INET/AF_INET6 families but they're only for one part (i.e. to avoid setting up TCP options on UNIX sockets). Other pieces continue to configure the socket regardless of its family. All of this directly acts on the FD, which is not correct since the FD is not valid here, it corresponds to the QUIC handle. The issue is much more visible when "option nolinger" is enabled in the frontend, because the access to fdatb[cfd].state immediately crashes on the first connection, as can be seen in github issue #2030. This patch bypasses this setup for FD-less connections, such as QUIC. However some of them could definitely be relevant to the QUIC stack, or even to UNIX sockets sometimes. A better long-term solution would consist in implementing a setsockopt() equivalent at the protocol layer that would be used to configure the socket, either the FD or the QUIC conn depending on the case. Some of them would not always be implemented but that would allow to unify all this code. This fix must be backported everywhere the commit above is backported, namely 2.6 and 2.7. 
Thanks to github user @twomoses for the nicely detailed report. (cherry picked from commit db991c2658e5b35dee0a18512f86ba107d724136) Signed-off-by: Willy Tarreau (cherry picked from commit 1dc3ef243744f90d06e5db4ccd779c1411298968) Signed-off-by: Willy Tarreau --- src/session.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/session.c b/src/session.c index 3cbacc7ae..66120d71d 100644 --- a/src/session.c +++ b/src/session.c @@ -182,7 +182,8 @@ int session_accept_fd(struct connection *cli_conn) */ if ((l->options & LI_O_TCP_L4_RULES) && !tcp_exec_l4_rules(sess)) { /* let's do a no-linger now to close with a single RST. */ - setsockopt(cfd, SOL_SOCKET, SO_LINGER, (struct linger *) &nolinger, sizeof(struct linger)); + if (!(cli_conn->flags & CO_FL_FDLESS)) + setsockopt(cfd, SOL_SOCKET, SO_LINGER, (struct linger *) &nolinger, sizeof(struct linger)); ret = 0; /* successful termination */ goto out_free_sess; } @@ -190,6 +191,12 @@ int session_accept_fd(struct connection *cli_conn) if (conn_xprt_start(cli_conn) < 0) goto out_free_sess; + /* FIXME/WTA: we should implement the setsockopt() calls at the proto + * level instead and let non-inet protocols implement their own equivalent. + */ + if (cli_conn->flags & CO_FL_FDLESS) + goto skip_fd_setup; + /* Adjust some socket options */ if (l->rx.addr.ss_family == AF_INET || l->rx.addr.ss_family == AF_INET6) { setsockopt(cfd, IPPROTO_TCP, TCP_NODELAY, (char *) &one, sizeof(one)); @@ -235,6 +242,7 @@ int session_accept_fd(struct connection *cli_conn) if (global.tune.client_rcvbuf) setsockopt(cfd, SOL_SOCKET, SO_RCVBUF, &global.tune.client_rcvbuf, sizeof(global.tune.client_rcvbuf)); + skip_fd_setup: /* OK, now either we have a pending handshake to execute with and then * we must return to the I/O layer, or we can proceed with the end of * the stream initialization. 
In case of handshake, we also set the I/O @@ -282,7 +290,8 @@ int session_accept_fd(struct connection *cli_conn) out_free_conn: if (ret < 0 && l->bind_conf->xprt == xprt_get(XPRT_RAW) && - p->mode == PR_MODE_HTTP && l->bind_conf->mux_proto == NULL) { + p->mode == PR_MODE_HTTP && l->bind_conf->mux_proto == NULL && + !(cli_conn->flags & CO_FL_FDLESS)) { /* critical error, no more memory, try to emit a 500 response */ send(cfd, http_err_msgs[HTTP_ERR_500], strlen(http_err_msgs[HTTP_ERR_500]), MSG_DONTWAIT|MSG_NOSIGNAL); From 663e673a3c9d51aaa5a47e9a39b754616c7c7219 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 12 Feb 2023 09:26:48 +0100 Subject: [PATCH 034/140] DOC: proxy-protocol: fix wrong byte in provided example There was a mistake in the example of proxy-proto frame provided, it cannot end with 0x02 but only 0x20 or 0x21 since the version is in the upper 4 bits and the lower ones are 0 for LOCAL or 1 for PROXY, hence the example should be: \x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A\x20 Thanks to Bram Grit for reporting this mistake. (cherry picked from commit e008402972e58ab99506bbbbf4d540a21f5be465) Signed-off-by: Willy Tarreau (cherry picked from commit 687176f6d661e7882fd3b76427f2fe9a53e7bc1e) Signed-off-by: Willy Tarreau --- doc/proxy-protocol.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/proxy-protocol.txt b/doc/proxy-protocol.txt index 4d49d5cd1..fac033176 100644 --- a/doc/proxy-protocol.txt +++ b/doc/proxy-protocol.txt @@ -500,7 +500,7 @@ protocol. 
Identifying the protocol version is easy : - if the incoming byte count is 16 or above and the 13 first bytes match the protocol signature block followed by the protocol version 2 : - \x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A\x02 + \x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A\x20 - otherwise, if the incoming byte count is 8 or above, and the 5 first characters match the US-ASCII representation of "PROXY" then the protocol From 73be199c4f5f1ed468161a4c5e10ca77cd5989d8 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 9 Feb 2023 21:36:54 +0100 Subject: [PATCH 035/140] BUG/CRITICAL: http: properly reject empty http header field names The HTTP header parsers surprizingly accepts empty header field names, and this is a leftover from the original code that was agnostic to this. When muxes were introduced, for H2 first, the HPACK decompressor needed to feed headers lists, and since empty header names were strictly forbidden by the protocol, the lists of headers were purposely designed to be terminated by an empty header field name (a principle that is similar to H1's empty line termination). This principle was preserved and generalized to other protocols migrated to muxes (H1/FCGI/H3 etc) without anyone ever noticing that the H1 parser was still able to deliver empty header field names to this list. In addition to this it turns out that the HPACK decompressor, despite a comment in the code, may successfully decompress an empty header field name, and this mistake was propagated to the QPACK decompressor as well. The impact is that an empty header field name may be used to truncate the list of headers and thus make some headers disappear. While for H2/H3 the impact is limited as haproxy sees a request with missing headers, and headers are not used to delimit messages, in the case of HTTP/1, the impact is significant because the presence (and sometimes contents) of certain sensitive headers is detected during the parsing. 
Thus, some of these headers may be seen, marked as present, their value extracted, but never delivered to upper layers and obviously not forwarded to the other side either. This can have for consequence that certain important header fields such as Connection, Upgrade, Host, Content-length, Transfer-Encoding etc are possibly seen as different between what haproxy uses to parse/forward/route and what is observed in http-request rules and of course, forwarded. One direct consequence is that it is possible to exploit this property in HTTP/1 to make affected versions of haproxy forward more data than is advertised on the other side, and bypass some access controls or routing rules by crafting extraneous requests. Note, however, that responses to such requests will normally not be passed back to the client, but this can still cause some harm. This specific risk can be mostly worked around in configuration using the following rule that will rely on the bug's impact to precisely detect the inconsistency between the known body size and the one expected to be advertised to the server (the rule works from 2.0 to 2.8-dev): http-request deny if { fc_http_major 1 } !{ req.body_size 0 } !{ req.hdr(content-length) -m found } !{ req.hdr(transfer-encoding) -m found } !{ method CONNECT } This will exclusively block such carefully crafted requests delivered over HTTP/1. HTTP/2 and HTTP/3 do not need content-length, and a body that arrives without being announced with a content-length will be forwarded using transfer-encoding, hence will not cause discrepancies. In HAProxy 2.0 in legacy mode ("no option http-use-htx"), this rule will simply have no effect but will not cause trouble either. A clean solution would consist in modifying the loops iterating over these headers lists to check the header name's pointer instead of its length (since both are zero at the end of the list), but this requires to touch tens of places and it's very easy to miss one. 
Functions such as htx_add_header(), htx_add_trailer(), htx_add_all_headers() would be good starting points for such a possible future change. Instead the current fix focuses on blocking empty headers where they are first inserted, hence in the H1/HPACK/QPACK decoders. One benefit of the current solution (for H1) is that it allows "show errors" to report a precise diagnostic when facing such invalid HTTP/1 requests, with the exact location of the problem and the originating address: $ printf "GET / HTTP/1.1\r\nHost: localhost\r\n:empty header\r\n\r\n" | nc 0 8001 HTTP/1.1 400 Bad request Content-length: 90 Cache-Control: no-cache Connection: close Content-Type: text/html

400 Bad request

Your browser sent an invalid request. $ socat /var/run/haproxy.stat <<< "show errors" Total events captured on [10/Feb/2023:16:29:37.530] : 1 [10/Feb/2023:16:29:34.155] frontend decrypt (#2): invalid request backend (#-1), server (#-1), event #0, src 127.0.0.1:31092 buffer starts at 0 (including 0 out), 16334 free, len 50, wraps at 16336, error at position 33 H1 connection flags 0x00000000, H1 stream flags 0x00000810 H1 msg state MSG_HDR_NAME(17), H1 msg flags 0x00001410 H1 chunk len 0 bytes, H1 body len 0 bytes : 00000 GET / HTTP/1.1\r\n 00016 Host: localhost\r\n 00033 :empty header\r\n 00048 \r\n I want to address sincere and warm thanks for their great work to the team composed of the following security researchers who found the issue together and reported it: Bahruz Jabiyev, Anthony Gavazzi, and Engin Kirda from Northeastern University, Kaan Onarlioglu from Akamai Technologies, Adi Peleg and Harvey Tuch from Google. And kudos to Amaury Denoyelle from HAProxy Technologies for spotting that the HPACK and QPACK decoders would let this pass despite the comment explicitly saying otherwise. This fix must be backported as far as 2.0. The QPACK changes can be dropped before 2.6. In 2.0 there is also the equivalent code for legacy mode, which doesn't suffer from the list truncation, but it would better be fixed regardless. CVE-2023-25725 was assigned to this issue. 
(cherry picked from commit a8598a2eb11b6c989e81f0dbf10be361782e8d32) Signed-off-by: Willy Tarreau (cherry picked from commit a0e561ad7f29ed50c473f5a9da664267b60d1112) Signed-off-by: Willy Tarreau --- src/h1.c | 4 ++++ src/hpack-dec.c | 9 +++++++++ src/qpack-dec.c | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/src/h1.c b/src/h1.c index 3330a5fcb..88a54c4a5 100644 --- a/src/h1.c +++ b/src/h1.c @@ -834,6 +834,10 @@ int h1_headers_to_hdr_list(char *start, const char *stop, if (likely(*ptr == ':')) { col = ptr - start; + if (col <= sol) { + state = H1_MSG_HDR_NAME; + goto http_msg_invalid; + } EAT_AND_JUMP_OR_RETURN(ptr, end, http_msg_hdr_l1_sp, http_msg_ood, state, H1_MSG_HDR_L1_SP); } diff --git a/src/hpack-dec.c b/src/hpack-dec.c index 147021cc3..052a7c3da 100644 --- a/src/hpack-dec.c +++ b/src/hpack-dec.c @@ -420,6 +420,15 @@ int hpack_decode_frame(struct hpack_dht *dht, const uint8_t *raw, uint32_t len, /* and are correctly filled here */ } + /* We must not accept empty header names (forbidden by the spec and used + * as a list termination). + */ + if (!name.len) { + hpack_debug_printf("##ERR@%d##\n", __LINE__); + ret = -HPACK_ERR_INVALID_ARGUMENT; + goto leave; + } + /* here's what we have here : * - name.len > 0 * - value is filled with either const data or data allocated from tmp diff --git a/src/qpack-dec.c b/src/qpack-dec.c index 0da6cf89a..2d8115645 100644 --- a/src/qpack-dec.c +++ b/src/qpack-dec.c @@ -531,6 +531,15 @@ int qpack_decode_fs(const unsigned char *raw, uint64_t len, struct buffer *tmp, len -= value_len; } + /* We must not accept empty header names (forbidden by the spec and used + * as a list termination). 
+ */ + if (!name.len) { + qpack_debug_printf(stderr, "##ERR@%d\n", __LINE__); + ret = -QPACK_DECOMPRESSION_FAILED; + goto out; + } + list[hdr_idx].n = name; list[hdr_idx].v = value; ++hdr_idx; From 3a3700aa04c2ec35262924d625ce4ac804cd6ee5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 14 Feb 2023 16:56:25 +0100 Subject: [PATCH 036/140] [RELEASE] Released version 2.6.9 Released version 2.6.9 with the following main changes : - BUG/MINOR: sink: make sure to always properly unmap a file-backed ring - DEV: haring: add a new option "-r" to automatically repair broken files - BUG/MINOR: log: release global log servers on exit - BUG/MINOR: sink: free the forwarding task on exit - DEV: hpack: fix `trash` build regression - BUG/MINOR: fcgi-app: prevent 'use-fcgi-app' in default section - MINOR: mux-quic/h3: define stream close callback - BUG/MEDIUM: h3: handle STOP_SENDING on control stream - BUG/MEDIUM: ssl: wrong eviction from the session cache tree - BUG/MINOR: h3: fix crash due to h3 traces - BUG/MINOR: stats: use proper buffer size for http dump - BUG/MINOR: stats: fix source buffer size for http dump - BUG/MEDIUM: stats: fix resolvers dump - BUG/MINOR: stats: fix ctx->field update in stats_dump_proxy_to_buffer() - BUG/MINOR: stats: fix show stats field ctx for servers - BUG/MINOR: stats: fix STAT_STARTED behavior with full htx - BUG/MINOR: quic: Possible stream truncations under heavy loss - BUG/MINOR: quic: Too big PTO during handshakes - BUG/MINOR: quic: Do not ignore coalesced packets in qc_prep_fast_retrans() - MINOR: quic: When probing Handshake packet number space, also probe the Initial one - BUG/MAJOR: quic: Possible crash when processing 1-RTT during 0-RTT session - MEDIUM: quic: Remove qc_conn_finalize() from the ClientHello TLS callbacks - BUG/MINOR: quic: Unchecked source connection ID - BUG/MEDIUM: quic: do not split STREAM frames if no space - BUG/MINOR: ssl/crt-list: warn when a line is malformated - BUG/MEDIUM: stick-table: do not leave 
entries in end of window during purge - BUG/MEDIUM: cache: use the correct time reference when comparing dates - DOC: config: fix option spop-check proxy compatibility - DOC: config: 'http-send-name-header' option may be used in default section - MINOR: cfgparse/server: move (min/max)conn postparsing logic into dedicated function - BUG/MINOR: server/add: ensure minconn/maxconn consistency when adding server - BUG/MEDIUM: stconn: Schedule a shutw on shutr if data must be sent first - BUG/MEDIUM: quic: fix crash when "option nolinger" is set in the frontend - DOC: proxy-protocol: fix wrong byte in provided example - BUG/CRITICAL: http: properly reject empty http header field names --- CHANGELOG | 37 +++++++++++++++++++++++++++++++++++++ VERDATE | 2 +- VERSION | 2 +- doc/configuration.txt | 2 +- 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8a603ed4d..362b9fcfd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,43 @@ ChangeLog : =========== +2023/02/14 : 2.6.9 + - BUG/MINOR: sink: make sure to always properly unmap a file-backed ring + - DEV: haring: add a new option "-r" to automatically repair broken files + - BUG/MINOR: log: release global log servers on exit + - BUG/MINOR: sink: free the forwarding task on exit + - DEV: hpack: fix `trash` build regression + - BUG/MINOR: fcgi-app: prevent 'use-fcgi-app' in default section + - MINOR: mux-quic/h3: define stream close callback + - BUG/MEDIUM: h3: handle STOP_SENDING on control stream + - BUG/MEDIUM: ssl: wrong eviction from the session cache tree + - BUG/MINOR: h3: fix crash due to h3 traces + - BUG/MINOR: stats: use proper buffer size for http dump + - BUG/MINOR: stats: fix source buffer size for http dump + - BUG/MEDIUM: stats: fix resolvers dump + - BUG/MINOR: stats: fix ctx->field update in stats_dump_proxy_to_buffer() + - BUG/MINOR: stats: fix show stats field ctx for servers + - BUG/MINOR: stats: fix STAT_STARTED behavior with full htx + - BUG/MINOR: quic: Possible 
stream truncations under heavy loss + - BUG/MINOR: quic: Too big PTO during handshakes + - BUG/MINOR: quic: Do not ignore coalesced packets in qc_prep_fast_retrans() + - MINOR: quic: When probing Handshake packet number space, also probe the Initial one + - BUG/MAJOR: quic: Possible crash when processing 1-RTT during 0-RTT session + - MEDIUM: quic: Remove qc_conn_finalize() from the ClientHello TLS callbacks + - BUG/MINOR: quic: Unchecked source connection ID + - BUG/MEDIUM: quic: do not split STREAM frames if no space + - BUG/MINOR: ssl/crt-list: warn when a line is malformated + - BUG/MEDIUM: stick-table: do not leave entries in end of window during purge + - BUG/MEDIUM: cache: use the correct time reference when comparing dates + - DOC: config: fix option spop-check proxy compatibility + - DOC: config: 'http-send-name-header' option may be used in default section + - MINOR: cfgparse/server: move (min/max)conn postparsing logic into dedicated function + - BUG/MINOR: server/add: ensure minconn/maxconn consistency when adding server + - BUG/MEDIUM: stconn: Schedule a shutw on shutr if data must be sent first + - BUG/MEDIUM: quic: fix crash when "option nolinger" is set in the frontend + - DOC: proxy-protocol: fix wrong byte in provided example + - BUG/CRITICAL: http: properly reject empty http header field names + 2023/01/24 : 2.6.8 - BUG/MINOR: http-htx: Don't consider an URI as normalized after a set-uri action - BUG/MEDIIM: stconn: Flush output data before forwarding close to write side diff --git a/VERDATE b/VERDATE index bdf0ce5c8..b0f52f409 100644 --- a/VERDATE +++ b/VERDATE @@ -1,2 +1,2 @@ $Format:%ci$ -2023/01/24 +2023/02/14 diff --git a/VERSION b/VERSION index 743af5e12..d48d3702a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.8 +2.6.9 diff --git a/doc/configuration.txt b/doc/configuration.txt index cdbcc948e..25b537686 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3,7 +3,7 @@ Configuration Manual ---------------------- version 
2.6 - 2023/01/24 + 2023/02/14 This document covers the configuration language as implemented in the version From 0ffcf775a55c01cb97888df04a17b7d4133e10d8 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 21 Feb 2023 12:44:56 +0100 Subject: [PATCH 037/140] BUG/MINOR: mworker: stop doing strtok directly from the env When parsing the HAPROXY_PROCESSES environement variable, strtok was done directly from the ptr resulting from getenv(), which replaces the ; by \0, showing confusing environment variables when debugging in /proc or in a corefile. Example: (gdb) x/39s *environ [...] 0x7fff6935af64: "HAPROXY_PROCESSES=|type=w" 0x7fff6935af7e: "fd=3" 0x7fff6935af83: "pid=4444" 0x7fff6935af8d: "rpid=1" 0x7fff6935af94: "reloads=0" 0x7fff6935af9e: "timestamp=1676338060" 0x7fff6935afb3: "id=" 0x7fff6935afb7: "version=2.4.0-8076da-1010+11" This patch fixes the issue by doing a strdup on the variable. Could be backported in previous versions (mworker_proc_to_env_list exists since 1.9) (cherry picked from commit d27f457eea470117c608bf9d1a3bd42bcdb8e5dd) Signed-off-by: William Lallemand (cherry picked from commit 607c038872777fa597613b557023b1ded45874e9) Signed-off-by: William Lallemand --- src/mworker.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/mworker.c b/src/mworker.c index d8ddb78cc..12218ab42 100644 --- a/src/mworker.c +++ b/src/mworker.c @@ -161,17 +161,26 @@ struct mworker_proc *mworker_proc_new() /* * unserialize the proc list from the environment + * Return < 0 upon error. 
*/ int mworker_env_to_proc_list() { - char *msg, *token = NULL, *s1; + char *env, *msg, *omsg = NULL, *token = NULL, *s1; struct mworker_proc *child; int minreloads = INT_MAX; /* minimum number of reloads to chose which processes are "current" ones */ + int err = 0; - msg = getenv("HAPROXY_PROCESSES"); - if (!msg) + env = getenv("HAPROXY_PROCESSES"); + if (!env) return 0; + omsg = msg = strdup(env); + if (!msg) { + ha_alert("Out of memory while trying to allocate a worker process structure."); + err = -1; + goto out; + } + while ((token = strtok_r(msg, "|", &s1))) { char *subtoken = NULL; char *s2; @@ -180,8 +189,9 @@ int mworker_env_to_proc_list() child = mworker_proc_new(); if (!child) { - ha_alert("Out of memory while trying to allocate a worker process structure."); - return -1; + ha_alert("out of memory while trying to allocate a worker process structure."); + err = -1; + goto out; } while ((subtoken = strtok_r(token, ";", &s2))) { @@ -237,7 +247,9 @@ int mworker_env_to_proc_list() unsetenv("HAPROXY_PROCESSES"); - return 0; +out: + free(omsg); + return err; } /* Signal blocking and unblocking */ From 934c7ea7ce4138192624f463dc7bb31a71fdd865 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 21 Feb 2023 13:17:24 +0100 Subject: [PATCH 038/140] BUG/MEDIUM: mworker: prevent inconsistent reload when upgrading from old versions Previous versions ( < 1.9 ) of the master-worker process didn't had the "HAPROXY_PROCESSES" environment variable which contains the list of processes, fd etc. The part which describes the master is created at first startup so if you started the master with an old version you would never have it. Since patch 68836740 ("MINOR: mworker: implement a reload failure counter"), the failedreloads member of the proc_self structure for the master is set to 0. However if this structure does not exist, it will result in a NULL dereference and crash the master. 
This patch fixes the issue by creating the proc_self structure for the master when it does not exist. It also shows a warning which states to restart the master if that is the case, because we can't guarantee that it will be working correctly. This MUST be backported as far as 2.5, and could be backported in every other stable branches. (cherry picked from commit e16d32050e0c91466c2466b93c177f38f66698e2) Signed-off-by: William Lallemand (cherry picked from commit 8648bf0d41b129cad2c42bdb807af6e8beef3a27) Signed-off-by: William Lallemand --- src/mworker.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/mworker.c b/src/mworker.c index 12218ab42..65eb89921 100644 --- a/src/mworker.c +++ b/src/mworker.c @@ -172,7 +172,7 @@ int mworker_env_to_proc_list() env = getenv("HAPROXY_PROCESSES"); if (!env) - return 0; + goto no_env; omsg = msg = strdup(env); if (!msg) { @@ -247,6 +247,24 @@ int mworker_env_to_proc_list() unsetenv("HAPROXY_PROCESSES"); +no_env: + + if (!proc_self) { + + proc_self = mworker_proc_new(); + if (!proc_self) { + ha_alert("Cannot allocate process structures.\n"); + err = -1; + goto out; + } + proc_self->options |= PROC_O_TYPE_MASTER; + proc_self->pid = pid; + proc_self->timestamp = 0; /* we don't know the startime anymore */ + + LIST_APPEND(&proc_list, &proc_self->list); + ha_warning("The master internals are corrupted or it was started with a too old version (< 1.9). Please restart the master process.\n"); + } + out: free(omsg); return err; From 2f874eaab7c4b4dbe3b485d05f3e46887d72c7c1 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 21 Feb 2023 13:41:24 +0100 Subject: [PATCH 039/140] BUG/MEDIUM: mworker: don't register mworker_accept_wrapper() when master FD is wrong This patch handles the case where the fd could be -1 when proc_self was lost for some reason (environment variable corrupted or upgrade from < 1.9). This could result in a out of bound array access fdtab[-1] and would crash. 
Must be backported in every maintained versions. (cherry picked from commit cc5b9fa593e139fa330f8c7161ff7514315f2837) Signed-off-by: William Lallemand (cherry picked from commit 49650955ae6a581226aeb7186650f4a45643a7ce) Signed-off-by: William Lallemand --- src/mworker.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mworker.c b/src/mworker.c index 65eb89921..686fc755d 100644 --- a/src/mworker.c +++ b/src/mworker.c @@ -463,6 +463,9 @@ static int mworker_pipe_register_per_thread() if (tid != 0) return 1; + if (proc_self->ipc_fd[1] < 0) /* proc_self was incomplete and we can't find the socketpair */ + return 1; + fd_set_nonblock(proc_self->ipc_fd[1]); /* In multi-tread, we need only one thread to process * events on the pipe with master From 978688bd26b1d704e6225ab88133986cd3cb34db Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Tue, 21 Feb 2023 14:07:05 +0100 Subject: [PATCH 040/140] MINOR: startup: HAPROXY_STARTUP_VERSION contains the version used to start HAPROXY_STARTUP_VERSION: contains the version used to start, in master-worker mode this is the version which was used to start the master, even after updating the binary and reloading. This patch could be backported in every version since it is useful when debugging. (cherry picked from commit d4c0be6b20f5365dc00221b51face2a32cf4053b) Signed-off-by: William Lallemand (cherry picked from commit e0fb690416d2a5d383b427426caf3ebf23ca0d4a) Signed-off-by: William Lallemand --- doc/configuration.txt | 4 ++++ src/haproxy.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/doc/configuration.txt b/doc/configuration.txt index 25b537686..4015ab76f 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -763,6 +763,10 @@ file, or could be inherited by a program (See 3.7. Programs): * HAPROXY_MASTER_CLI: In master-worker mode, listeners addresses of the master CLI, separated by semicolons. 
+* HAPROXY_STARTUP_VERSION: contains the version used to start, in master-worker + mode this is the version which was used to start the master, even after + updating the binary and reloading. + In addition, some pseudo-variables are internally resolved and may be used as regular variables. Pseudo-variables always start with a dot ('.'), and are the only ones where the dot is permitted. The current list of pseudo-variables is: diff --git a/src/haproxy.c b/src/haproxy.c index 809c9eb05..5a3f6c755 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -1490,6 +1490,8 @@ static void init_early(int argc, char **argv) char *tmp; int len; + setenv("HAPROXY_STARTUP_VERSION", HAPROXY_VERSION, 0); + /* First, let's initialize most global variables */ totalconn = actconn = listeners = stopping = 0; killed = pid = 0; From 1f1d3355300346195325926172f9e576e7292035 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 9 Feb 2023 15:26:25 +0100 Subject: [PATCH 041/140] BUG/MINOR: lua/httpclient: missing free in hlua_httpclient_send() In hlua_httpclient_send(), we replace hc->req.url with a new url. But we forgot to free the original url that was allocated in hlua_httpclient_new() or in the previous httpclient_send() call. Because of this, each httpclient request performed under lua scripts would result in a small leak. When stress-testing a lua action which uses httpclient, the leak is clearly visible since we're leaking severals Mbytes per minute. This bug was discovered by chance when trying to reproduce GH issue #2037. 
It must be backported up to 2.5 (cherry picked from commit 035640733251b7b8ce0df80f5d7429c3cb19d8f9) Signed-off-by: William Lallemand (cherry picked from commit aa7b63e59dc922a98e0f77e3dbde2102de1d26cc) Signed-off-by: William Lallemand --- src/hlua.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hlua.c b/src/hlua.c index 48355a11a..1f2448597 100644 --- a/src/hlua.c +++ b/src/hlua.c @@ -7383,6 +7383,7 @@ __LJMP static int hlua_httpclient_send(lua_State *L, enum http_meth_t meth) hlua_hc->sent = 0; + istfree(&hlua_hc->hc->req.url); hlua_hc->hc->req.url = istdup(ist(url_str)); hlua_hc->hc->req.meth = meth; From 4cf17404506d123990eb26101707a663e7bf2a80 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Thu, 9 Feb 2023 17:02:57 +0100 Subject: [PATCH 042/140] BUG/MEDIUM: httpclient/lua: fix a race between lua GC and hlua_ctx_destroy In bb581423b ("BUG/MEDIUM: httpclient/lua: crash when the lua task timeout before the httpclient"), a new logic was implemented to make sure that when a lua ctx destroyed, related httpclients are correctly destroyed too to prevent a such httpclients from being resuscitated on a destroyed lua ctx. This was implemented by adding a list of httpclients within the lua ctx, and a new function, hlua_httpclient_destroy_all(), that is called under hlua_ctx_destroy() and runs through the httpclients list in the lua context to properly terminate them. This was done with the assumption that no concurrent Lua garbage collection cycles could occur on the same ressources, which seems OK since the "lua" context is about to be freed and is not explicitly being used by other threads. But when 'lua-load' is used, the main lua stack is shared between multiple OS threads, which means that all lua ctx in the process are linked to the same parent stack. 
Yet it seems that lua GC, which can be triggered automatically under lua_resume() or manually through lua_gc(), does not limit itself to the "coroutine" stack (the stack referenced in lua->T) when performing the cleanup, but is able to perform some cleanup on the main stack plus coroutines stacks that were created under the same main stack (via lua_newthread()) as well. This can be explained by the fact that lua_newthread() coroutines are not meant to be thread-safe by design. Source: http://lua-users.org/lists/lua-l/2011-07/msg00072.html (lua co-author) It did not cause other issues so far because most of the time when using 'lua-load', the global lua lock is taken when performing critical operations that are known to interfere with the main stack. But here in hlua_httpclient_destroy_all(), we don't run under the global lock. Now that we properly understand the issue, the fix is pretty trivial: We could simply guard the hlua_httpclient_destroy_all() under the global lua lock, this would work but it could increase the contention over the global lock. Instead, we switched 'lua->hc_list' which was introduced with bb581423b from simple list to mt_list so that concurrent accesses between hlua_httpclient_destroy_all and hlua_httpclient_gc() are properly handled. The issue was reported by @Mark11122 on Github #2037. This must be backported with bb581423b ("BUG/MEDIUM: httpclient/lua: crash when the lua task timeout before the httpclient") as far as 2.5. 
(cherry picked from commit 3ffbf3896d6e6d6bef99763957fb67fb9023cce3) Signed-off-by: William Lallemand (cherry picked from commit 5a9a9efc52e8fd61e63ed54b29f50a7f78ca1131) Signed-off-by: William Lallemand --- include/haproxy/hlua-t.h | 4 ++-- src/hlua.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/haproxy/hlua-t.h b/include/haproxy/hlua-t.h index bc8c8df73..abcb29fad 100644 --- a/include/haproxy/hlua-t.h +++ b/include/haproxy/hlua-t.h @@ -111,7 +111,7 @@ struct hlua { struct task *task; /* The task associated with the lua stack execution. We must wake this task to continue the task execution */ struct list com; /* The list head of the signals attached to this task. */ - struct list hc_list; /* list of httpclient associated to this lua task */ + struct mt_list hc_list; /* list of httpclient associated to this lua task */ struct ebpt_node node; int gc_count; /* number of items which need a GC */ }; @@ -199,7 +199,7 @@ struct hlua_httpclient { struct httpclient *hc; /* ptr to the httpclient instance */ size_t sent; /* payload sent */ luaL_Buffer b; /* buffer used to prepare strings. 
*/ - struct list by_hlua; /* linked in the current hlua task */ + struct mt_list by_hlua; /* linked in the current hlua task */ }; #else /* USE_LUA */ diff --git a/src/hlua.c b/src/hlua.c index 1f2448597..17814775f 100644 --- a/src/hlua.c +++ b/src/hlua.c @@ -1220,7 +1220,7 @@ int hlua_ctx_init(struct hlua *lua, int state_id, struct task *task, int already lua->wake_time = TICK_ETERNITY; lua->state_id = state_id; LIST_INIT(&lua->com); - LIST_INIT(&lua->hc_list); + MT_LIST_INIT(&lua->hc_list); if (!already_safe) { if (!SET_SAFE_LJMP_PARENT(lua)) { lua->Tref = LUA_REFNIL; @@ -1242,16 +1242,30 @@ int hlua_ctx_init(struct hlua *lua, int state_id, struct task *task, int already return 1; } -/* kill all associated httpclient to this hlua task */ +/* kill all associated httpclient to this hlua task + * We must take extra precautions as we're manipulating lua-exposed + * objects without the main lua lock. + */ static void hlua_httpclient_destroy_all(struct hlua *hlua) { - struct hlua_httpclient *hlua_hc, *back; + struct hlua_httpclient *hlua_hc; - list_for_each_entry_safe(hlua_hc, back, &hlua->hc_list, by_hlua) { - if (hlua_hc->hc) - httpclient_stop_and_destroy(hlua_hc->hc); + /* use thread-safe accessors for hc_list since GC cycle initiated by + * another thread sharing the same main lua stack (lua coroutine) + * could execute hlua_httpclient_gc() on the hlua->hc_list items + * in parallel: Lua GC applies on the main stack, it is not limited to + * a single coroutine stack, see Github issue #2037 for reference. + * Remember, coroutines created using lua_newthread() are not meant to + * be thread safe in Lua. (From lua co-author: + * http://lua-users.org/lists/lua-l/2011-07/msg00072.html) + * + * This security measure is superfluous when 'lua-load-per-thread' is used + * since in this case coroutines exclusively run on the same thread + * (main stack is not shared between OS threads). 
+ */ + while ((hlua_hc = MT_LIST_POP(&hlua->hc_list, typeof(hlua_hc), by_hlua))) { + httpclient_stop_and_destroy(hlua_hc->hc); hlua_hc->hc = NULL; - LIST_DEL_INIT(&hlua_hc->by_hlua); } } @@ -7026,11 +7040,11 @@ __LJMP static int hlua_httpclient_gc(lua_State *L) hlua_hc = MAY_LJMP(hlua_checkhttpclient(L, 1)); - if (hlua_hc->hc) + if (MT_LIST_DELETE(&hlua_hc->by_hlua)) { + /* we won the race against hlua_httpclient_destroy_all() */ httpclient_stop_and_destroy(hlua_hc->hc); - - hlua_hc->hc = NULL; - LIST_DEL_INIT(&hlua_hc->by_hlua); + hlua_hc->hc = NULL; + } return 0; } @@ -7061,7 +7075,7 @@ __LJMP static int hlua_httpclient_new(lua_State *L) if (!hlua_hc->hc) goto err; - LIST_APPEND(&hlua->hc_list, &hlua_hc->by_hlua); + MT_LIST_APPEND(&hlua->hc_list, &hlua_hc->by_hlua); /* Pop a class stream metatable and affect it to the userdata. */ lua_rawgeti(L, LUA_REGISTRYINDEX, class_httpclient_ref); From 7b4e6216d8dfedca36af7545b564958e624c5ea1 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 2 Mar 2023 15:05:31 +0100 Subject: [PATCH 043/140] MINOR: fd/cli: report the polling mask in "show fd" It's missing and often needed when trying to debug a situation, let's report the polling mask as well in "show fd". (cherry picked from commit 677c006c5c66db1b421eaac81926407e7ec686da) Signed-off-by: Willy Tarreau (cherry picked from commit bded43308f1d34f5625dfb02b99502a244576112) [wt: adjust ctx] Signed-off-by: Willy Tarreau --- src/cli.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cli.c b/src/cli.c index 14e82f601..54d0b2712 100644 --- a/src/cli.c +++ b/src/cli.c @@ -1333,7 +1333,7 @@ static int cli_io_handler_show_fd(struct appctx *appctx) suspicious = 1; chunk_printf(&trash, - " %5d : st=0x%06x(%c%c %c%c%c%c%c W:%c%c%c R:%c%c%c) tmask=0x%lx umask=0x%lx owner=%p iocb=%p(", + " %5d : st=0x%06x(%c%c %c%c%c%c%c W:%c%c%c R:%c%c%c) tmask=0x%lx umask=0x%lx prmsk=0x%lx pwmsk=0x%lx owner=%p iocb=%p(", fd, fdt.state, (fdt.state & FD_CLONED) ? 
'C' : 'c', @@ -1350,6 +1350,8 @@ static int cli_io_handler_show_fd(struct appctx *appctx) (fdt.state & FD_EV_READY_R) ? 'R' : 'r', (fdt.state & FD_EV_ACTIVE_R) ? 'A' : 'a', fdt.thread_mask, fdt.update_mask, + polled_mask[fd].poll_recv, + polled_mask[fd].poll_send, fdt.owner, fdt.iocb); resolve_sym_name(&trash, NULL, fdt.iocb); From 7f35721face689026fe66126bbf702f857514dcd Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 14 Feb 2023 11:01:51 +0100 Subject: [PATCH 044/140] BUG/MEDIUM: stconn: Don't rearm the read expiration date if EOI was reached At the stream level, the read expiration date is unset if a shutr was received but not if the end of input was reached. If we know no more data are excpected, there is no reason to let the read expiration date armed, except to respect clientfin/serverfin timeout on some circumstances. This patch could slowly be backported as far as 2.2. (cherry picked from commit 407210a34d781f8249504557c371c170cb34f93e) Signed-off-by: Christopher Faulet (cherry picked from commit 5eb0053ce16d1b9242eece90ae351afcdc7592e2) Signed-off-by: Willy Tarreau --- src/stconn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/stconn.c b/src/stconn.c index 0dd110ea6..3690f4fb6 100644 --- a/src/stconn.c +++ b/src/stconn.c @@ -1056,7 +1056,7 @@ void sc_update_rx(struct stconn *sc) */ sc_have_room(sc); } - if (sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) + if ((ic->flags & CF_EOI) || sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) ic->rex = TICK_ETERNITY; else if (!(ic->flags & CF_READ_NOEXP) && !tick_isset(ic->rex)) ic->rex = tick_add_ifset(now_ms, ic->rto); @@ -1204,7 +1204,7 @@ static void sc_notify(struct stconn *sc) sc_chk_rcv(sc); sc_chk_rcv(sco); - if (ic->flags & CF_SHUTR || sc_ep_test(sc, SE_FL_APPLET_NEED_CONN) || + if (ic->flags & (CF_EOI|CF_SHUTR) || sc_ep_test(sc, SE_FL_APPLET_NEED_CONN) || (sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM))) { ic->rex = 
TICK_ETERNITY; } From c38d665d0480b08c7c553a6e6ecb80afae0463e2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Feb 2023 09:07:00 +0100 Subject: [PATCH 045/140] BUG/MINOR: sched: properly report long_rq when tasks remain in the queue There's a per-thread "long_rq" counter that is used to indicate how often we leave the scheduler with tasks still present in the run queue. The purpose is to know when tune.runqueue-depth served to limit latency, due to a large number of tasks being runnable at once. However there's a bug there, it's not always set: if after the first run, one heavy task was processed and later only heavy tasks remain, we'll loop back to not_done_yet where we try to pick more tasks, but none are eligible (since heavy ones have already run) so we directly return without incrementing the counter. This is what causes ultra-low values on long_rq during massive SSL handshakes, that are confusing because they make one believe that tl_class_mask doesn't have the HEAVY flag anymore. Let's just fix that by not returning from the middle of the function. This can be backported as far as 2.4. 
(cherry picked from commit 2e270cf0b0824fb2b83f2ee737a75272687ba9c4) Signed-off-by: Christopher Faulet (cherry picked from commit 31d3ddb23eea3523604c02f9cec8948bbe425829) Signed-off-by: Willy Tarreau --- src/task.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/task.c b/src/task.c index 17200312a..2d512be78 100644 --- a/src/task.c +++ b/src/task.c @@ -818,7 +818,7 @@ void process_runnable_tasks() */ max_total = max[TL_URGENT] + max[TL_NORMAL] + max[TL_BULK] + max[TL_HEAVY]; if (!max_total) - return; + goto leave; for (queue = 0; queue < TL_CLASSES; queue++) max[queue] = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total; @@ -928,6 +928,7 @@ void process_runnable_tasks() if (max_processed > 0 && thread_has_tasks()) goto not_done_yet; + leave: if (tt->tl_class_mask) activity[tid].long_rq++; } From a7e743d153a5cb65164478c5e9d835eb88422e54 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Feb 2023 09:19:21 +0100 Subject: [PATCH 046/140] BUG/MEDIUM: sched: allow a bit more TASK_HEAVY to be processed when needed As reported in github issue #1881, there are situations where an excess of TLS handshakes can cause a livelock. What's happening is that normally we process at most one TLS handshake per loop iteration to maintain the latency low. This is done by tagging them with TASK_HEAVY, queuing these tasklets in the TL_HEAVY queue. But if something slows down the loop, such as a connect() call when no more ports are available, we could end up processing no more than a few hundred or thousands handshakes per second. If the limit becomes lower than the rate of incoming handshakes, we will accumulate them and at some point users will get impatient and give up or retry. Then a new problem happens: the queue fills up with even more handshake attempts, only one of which will be handled per iteration, so we can end up processing only outdated handshakes at a low rate, with basically nothing else in the queue. 
This can for example happen in parallel with health checks that don't require incoming handshakes to succeed to continue to cause some activity that could maintain the high latency stuff active. Here we're taking a slightly different approach. First, instead of always allowing only one handshake per loop (and usually it's critical for latency), we take the current situation into account: - if configured with tune.sched.low-latency, the limit remains 1 - if there are other non-heavy tasks, we set the limit to 1 + one per 1024 tasks, so that a heavily loaded queue of 4k handshakes per thread will be able to drain them at ~4 per loops with a limited impact on latency - if there are no other tasks, the limit grows to 1 + one per 128 tasks, so that a heavily loaded queue of 4k handshakes per thread will be able to drain them at ~32 per loop with still a very limited impact on latency since only I/O will get delayed. It was verified on a 56-core Xeon-8480 that this did not degrade the latency; all requests remained below 1ms end-to-end in full close+ handshake, and even 500us under low-lat + busy-polling. This must be backported to 2.4. (cherry picked from commit ba4c7a15978deaf74b6af09d2a13b4fff7ccea74) Signed-off-by: Christopher Faulet (cherry picked from commit e5713fb24194166e273ece9c58eddfad8ca39627) Signed-off-by: Willy Tarreau --- src/task.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/task.c b/src/task.c index 2d512be78..a926f4cdd 100644 --- a/src/task.c +++ b/src/task.c @@ -823,11 +823,26 @@ void process_runnable_tasks() for (queue = 0; queue < TL_CLASSES; queue++) max[queue] = ((unsigned)max_processed * max[queue] + max_total - 1) / max_total; - /* The heavy queue must never process more than one task at once - * anyway. + /* The heavy queue must never process more than very few tasks at once + * anyway. 
We set the limit to 1 if running on low_latency scheduling, + * given that we know that other values can have an impact on latency + * (~500us end-to-end connection achieved at 130kcps in SSL), 1 + one + * per 1024 tasks if there is at least one non-heavy task while still + * respecting the ratios above, or 1 + one per 128 tasks if only heavy + * tasks are present. This allows to drain excess SSL handshakes more + * efficiently if the queue becomes congested. */ - if (max[TL_HEAVY] > 1) - max[TL_HEAVY] = 1; + if (max[TL_HEAVY] > 1) { + if (global.tune.options & GTUNE_SCHED_LOW_LATENCY) + budget = 1; + else if (tt->tl_class_mask & ~(1 << TL_HEAVY)) + budget = 1 + tt->rq_total / 1024; + else + budget = 1 + tt->rq_total / 128; + + if (max[TL_HEAVY] > budget) + max[TL_HEAVY] = budget; + } lrq = grq = NULL; From cb9a8fdbaef4a642eeb84ac9feaf636720d18360 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Fri, 17 Feb 2023 16:23:52 +0100 Subject: [PATCH 047/140] BUG/MINOR: mworker: prevent incorrect values in uptime Since the recent changes on the clocks, now.tv_sec is not to be used between processes because it's a clock which is local to the process and does not contain a real unix timestamp. This patch fixes the issue by using "date.tv_sec" which is the wall clock instead of "now.tv_sec". It prevents having incoherent timestamps. It also introduces some checks on negative values in order to never display a negative value if it was computed from a wrong value set by a previous haproxy version. It must be backported as far as 2.0. 
(cherry picked from commit 5a7f83af84d2a08f69ce1629c7609c98f43411ab) Signed-off-by: Christopher Faulet (cherry picked from commit 7b8337e0cbaeebec26eab0062f0706f9388e3e2c) Signed-off-by: Willy Tarreau --- src/haproxy.c | 2 +- src/mworker.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/haproxy.c b/src/haproxy.c index 5a3f6c755..0cb0662d2 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -3432,7 +3432,7 @@ int main(int argc, char **argv) if (child->reloads == 0 && child->options & PROC_O_TYPE_WORKER && child->pid == -1) { - child->timestamp = now.tv_sec; + child->timestamp = date.tv_sec; child->pid = ret; child->version = strdup(haproxy_version); break; diff --git a/src/mworker.c b/src/mworker.c index 686fc755d..26b16cca4 100644 --- a/src/mworker.c +++ b/src/mworker.c @@ -559,13 +559,16 @@ static int cli_io_handler_show_proc(struct appctx *appctx) struct stconn *sc = appctx_sc(appctx); struct mworker_proc *child; int old = 0; - int up = now.tv_sec - proc_self->timestamp; + int up = date.tv_sec - proc_self->timestamp; char *uptime = NULL; char *reloadtxt = NULL; if (unlikely(sc_ic(sc)->flags & (CF_WRITE_ERROR|CF_SHUTW))) return 1; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; + chunk_reset(&trash); memprintf(&reloadtxt, "%d [failed: %d]", proc_self->reloads, proc_self->failedreloads); @@ -579,7 +582,9 @@ static int cli_io_handler_show_proc(struct appctx *appctx) chunk_appendf(&trash, "# workers\n"); list_for_each_entry(child, &proc_list, list) { - up = now.tv_sec - child->timestamp; + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; if (!(child->options & PROC_O_TYPE_WORKER)) continue; @@ -600,7 +605,9 @@ static int cli_io_handler_show_proc(struct appctx *appctx) chunk_appendf(&trash, "# old workers\n"); list_for_each_entry(child, &proc_list, list) { - up = now.tv_sec - child->timestamp; + up = date.tv_sec - child->timestamp; + 
if (up <= 0) /* must never be negative because of clock drift */ + up = 0; if (!(child->options & PROC_O_TYPE_WORKER)) continue; @@ -618,7 +625,9 @@ static int cli_io_handler_show_proc(struct appctx *appctx) chunk_appendf(&trash, "# programs\n"); old = 0; list_for_each_entry(child, &proc_list, list) { - up = now.tv_sec - child->timestamp; + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; if (!(child->options & PROC_O_TYPE_PROG)) continue; @@ -635,7 +644,9 @@ static int cli_io_handler_show_proc(struct appctx *appctx) if (old) { chunk_appendf(&trash, "# old programs\n"); list_for_each_entry(child, &proc_list, list) { - up = now.tv_sec - child->timestamp; + up = date.tv_sec - child->timestamp; + if (up < 0) /* must never be negative because of clock drift */ + up = 0; if (!(child->options & PROC_O_TYPE_PROG)) continue; From df523a6c161b234ba9da4b147a5389299bfa6aab Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 20 Feb 2023 16:57:47 +0100 Subject: [PATCH 048/140] MINOR: mux-h2/traces: do not log h2s pointer for dummy streams Functions which are called with dummy streams pass it down the traces and that leads to somewhat confusing "h2s=0x1234568(0,IDL)" for example while the nature of the called function makes this stream useless at that place. Better not report a random pointer, especially since it always requires to look at the code before remembering how this should be interpreted. Now what we're doing is that the idle stream only prints "h2s=IDL" which is shorter and doesn't report a pointer, closed stream do not report anything since the stream ID 0 already implies it, and other ones are reported normally. This could be backported to 2.7 and 2.6 as it improves traces legibility. 
(cherry picked from commit f9f4499429e678ea648b5fe587662d44402c4ea6) Signed-off-by: Christopher Faulet (cherry picked from commit bae16e96db6b7b3ae35f53d147317ac90ad6fb84) Signed-off-by: Willy Tarreau --- src/mux_h2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mux_h2.c b/src/mux_h2.c index ec5141bc4..08f361d15 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -655,7 +655,10 @@ static void h2_trace(enum trace_level level, uint64_t mask, const struct trace_s if (h2s) { if (h2s->id <= 0) chunk_appendf(&trace_buf, " dsi=%d", h2c->dsi); - chunk_appendf(&trace_buf, " h2s=%p(%d,%s)", h2s, h2s->id, h2s_st_to_str(h2s->st)); + if (h2s == h2_idle_stream) + chunk_appendf(&trace_buf, " h2s=IDL"); + else if (h2s != h2_closed_stream) + chunk_appendf(&trace_buf, " h2s=%p(%d,%s)", h2s, h2s->id, h2s_st_to_str(h2s->st)); if (h2s->id && h2s->errcode) chunk_appendf(&trace_buf, " err=%s/%02x", h2_err_str(h2s->errcode), h2s->errcode); } From 670e5450ce2fd9622184f183ec713b15d7d4e55f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Mon, 20 Feb 2023 17:05:10 +0100 Subject: [PATCH 049/140] MINOR: mux-h2/traces: add a missing TRACE_LEAVE() in h2s_frt_handle_headers() Traces from this function would miss a TRACE_LEAVE() on the success path, which had for consequences, 1) that it was difficult to figure where the function was left, and 2) that we never had the allocated stream ID clearly visible (actually the one returned by h2c_frt_stream_new() is the right one but it's not obvious). This can be backported to 2.7 and 2.6. 
(cherry picked from commit 0d6e5d271f76a75b54b13459a3f5c86117075142) Signed-off-by: Christopher Faulet (cherry picked from commit bf66313d9465f3620109d1bbff13aff1c09b400e) Signed-off-by: Willy Tarreau --- src/mux_h2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mux_h2.c b/src/mux_h2.c index 08f361d15..94017e9bc 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -2905,6 +2905,7 @@ static struct h2s *h2c_frt_handle_headers(struct h2c *h2c, struct h2s *h2s) else h2s_close(h2s); } + TRACE_LEAVE(H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, h2s); return h2s; conn_err: From 31a2f615b9902db2d178486ddb0d7d6e3fff552b Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 21 Feb 2023 11:24:04 +0100 Subject: [PATCH 050/140] REGTESTS: Fix ssl_errors.vtc script to wait for connections close In this scripts, several clients perform a requests and exit because an SSL error is expected and thus no response is sent. However, we must explicitly wait for the connection close, via an "expect_close" statement. Otherwise, depending on the timing, HAProxy may detect the client abort before any connection attempt on the server side and no SSL error is reported, making the script to fail. 
(cherry picked from commit 4ad6ee94ab6d4b0d9d51065c8e9078ca018d4d1f) Signed-off-by: Christopher Faulet (cherry picked from commit 97f2e16493eee963455e944945808e6cf756116b) Signed-off-by: Willy Tarreau --- reg-tests/ssl/ssl_errors.vtc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/reg-tests/ssl/ssl_errors.vtc b/reg-tests/ssl/ssl_errors.vtc index 45a22c153..8fb9c5a12 100644 --- a/reg-tests/ssl/ssl_errors.vtc +++ b/reg-tests/ssl/ssl_errors.vtc @@ -327,14 +327,17 @@ shell { client c4 -connect ${h1_clearlst_sock} { txreq + expect_close } -run client c5 -connect ${h1_clearlst_sock} { txreq + expect_close } -run client c6 -connect ${h1_clearlst_sock} { txreq + expect_close } -run barrier b1 sync @@ -355,14 +358,17 @@ shell { client c7 -connect ${h1_clearlst_sock} { txreq + expect_close } -run client c8 -connect ${h1_clearlst_sock} { txreq + expect_close } -run client c9 -connect ${h1_clearlst_sock} { txreq + expect_close } -run barrier b1 sync @@ -377,12 +383,15 @@ shell { # "No shared cipher" errors client c10 -connect ${h1_wrongcipherslst_sock} { txreq + expect_close } -run client c11 -connect ${h1_wrongcipherslst_sock} { txreq + expect_close } -run client c12 -connect ${h1_wrongcipherslst_sock} { txreq + expect_close } -run @@ -399,22 +408,27 @@ client c13 -connect ${h1_backenderrorslst_sock} { barrier b2 sync client c14 -connect ${h1_backenderrorslst_sock} { txreq + expect_close } -run barrier b2 sync client c15 -connect ${h1_backenderrorslst_sock} { txreq + expect_close } -run barrier b2 sync client c16 -connect ${h1_backenderrorslst_sock} { txreq + expect_close } -run barrier b2 sync client c17 -connect ${h1_backenderrorslst_sock} { txreq + expect_close } -run barrier b2 sync client c18 -connect ${h1_backenderrorslst_sock} { txreq + expect_close } -run syslog Slg_cust_fmt -wait From 5c3b4965e2f0af046c4df40e39fb3d21860f5de2 Mon Sep 17 00:00:00 2001 From: Remi Tricot-Le Breton Date: Tue, 21 Feb 2023 11:47:17 +0100 Subject: [PATCH 051/140] BUG/MINOR: 
cache: Cache response even if request has "no-cache" directive Since commit cc9bf2e5f "MEDIUM: cache: Change caching conditions" responses that do not have an explicit expiration time are not cached anymore. But this mechanism wrongly used the TX_CACHE_IGNORE flag instead of the TX_CACHEABLE one. The effect this had is that a cacheable response that corresponded to a request having a "Cache-Control: no-cache" for instance would not be cached. Contrary to what was said in the other commit message, the "checkcache" option should not be impacted by the use of the TX_CACHEABLE flag instead of the TX_CACHE_IGNORE one. The response is indeed considered as not cacheable if it has no expiration time, regardless of the presence of a cookie in the response. This should fix GitHub issue #2048. This patch can be backported up to branch 2.4. (cherry picked from commit 879debeecb93202c983f25f5f1d765e74d77faa5) Signed-off-by: Christopher Faulet (cherry picked from commit c6dbdbc94329c02402f2560299ca97f0c5a82e49) Signed-off-by: Willy Tarreau --- include/haproxy/http_ana-t.h | 2 +- reg-tests/cache/caching_rules.vtc | 32 +++++++++++++++++++++++++++++++ src/cache.c | 2 +- src/http_ana.c | 2 +- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/include/haproxy/http_ana-t.h b/include/haproxy/http_ana-t.h index b267ebebd..24e1b3989 100644 --- a/include/haproxy/http_ana-t.h +++ b/include/haproxy/http_ana-t.h @@ -59,7 +59,7 @@ /* cacheability management, bits values 0x1000 to 0x3000 (0-3 shift 12) */ #define TX_CACHEABLE 0x00001000 /* at least part of the response is cacheable */ #define TX_CACHE_COOK 0x00002000 /* a cookie in the response is cacheable */ -#define TX_CACHE_IGNORE 0x00004000 /* do not retrieve object from cache, or avoid caching response */ +#define TX_CACHE_IGNORE 0x00004000 /* do not retrieve object from cache */ #define TX_CACHE_SHIFT 12 /* bit shift */ #define TX_CON_WANT_TUN 0x00008000 /* Will be a tunnel (CONNECT or 101-Switching-Protocol) */ diff --git 
a/reg-tests/cache/caching_rules.vtc b/reg-tests/cache/caching_rules.vtc index b1ea7f979..61274b4b5 100644 --- a/reg-tests/cache/caching_rules.vtc +++ b/reg-tests/cache/caching_rules.vtc @@ -67,6 +67,18 @@ server s1 { txresp -hdr "Cache-Control: max-age=500" \ -hdr "Age: 100" -bodylen 140 + + # "Control-Cache: no-cache" on client request but still stored in cache + rxreq + expect req.url == "/nocache" + txresp -hdr "Cache-Control: max-age=500" \ + -hdr "Age: 100" -bodylen 140 + + rxreq + expect req.url == "/nocache" + txresp -hdr "Cache-Control: max-age=500" \ + -hdr "Age: 100" -bodylen 140 + } -start server s2 { @@ -221,4 +233,24 @@ client c1 -connect ${h1_fe_sock} { expect resp.bodylen == 140 expect resp.http.X-Cache-Hit == 1 + # Cache-Control: no-cache + txreq -url "/nocache" -hdr "Cache-Control: no-cache" + rxresp + expect resp.status == 200 + expect resp.bodylen == 140 + expect resp.http.X-Cache-Hit == 0 + + txreq -url "/nocache" -hdr "Cache-Control: no-cache" + rxresp + expect resp.status == 200 + expect resp.bodylen == 140 + expect resp.http.X-Cache-Hit == 0 + + txreq -url "/nocache" + rxresp + expect resp.status == 200 + expect resp.bodylen == 140 + expect resp.http.X-Cache-Hit == 1 + + } -run diff --git a/src/cache.c b/src/cache.c index 4992b5e22..adb1aa24c 100644 --- a/src/cache.c +++ b/src/cache.c @@ -1101,7 +1101,7 @@ enum act_return http_action_store_cache(struct act_rule *rule, struct proxy *px, http_check_response_for_cacheability(s, &s->res); - if (!(txn->flags & TX_CACHEABLE) || !(txn->flags & TX_CACHE_COOK) || (txn->flags & TX_CACHE_IGNORE)) + if (!(txn->flags & TX_CACHEABLE) || !(txn->flags & TX_CACHE_COOK)) goto out; shctx_lock(shctx); diff --git a/src/http_ana.c b/src/http_ana.c index d9f9201ca..13f9641bd 100644 --- a/src/http_ana.c +++ b/src/http_ana.c @@ -3952,7 +3952,7 @@ void http_check_response_for_cacheability(struct stream *s, struct channel *res) /* We won't store an entry that has neither a cache validator nor an * explicit expiration 
time, as suggested in RFC 7234#3. */ if (!has_freshness_info && !has_validator) - txn->flags |= TX_CACHE_IGNORE; + txn->flags &= ~TX_CACHEABLE; } /* From ab90e1cbd4cfcaeb0598c2935fbec71ceafa6c59 Mon Sep 17 00:00:00 2001 From: Remi Tricot-Le Breton Date: Tue, 21 Feb 2023 17:42:04 +0100 Subject: [PATCH 052/140] BUG/MINOR: cache: Check cache entry is complete in case of Vary Before looking for a secondary cache entry for a given request we checked that the first entry was complete, which might prevent us from using a valid entry if the first one with the same primary key is not full yet. Likewise, if the primary entry is complete but not the secondary entry we try to use, we might end up using a partial entry from the cache as a response. This bug was raised in GitHub #2048. It can be backported up to branch 2.4. (cherry picked from commit 25917cdb12412378a80e755ffc18b5cb67c36fd2) Signed-off-by: Christopher Faulet (cherry picked from commit 4208eb8c0fdf6fae47f10e38c9aeeba6b811cfc1) Signed-off-by: Willy Tarreau --- src/cache.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/cache.c b/src/cache.c index adb1aa24c..1f98ea52b 100644 --- a/src/cache.c +++ b/src/cache.c @@ -1802,8 +1802,10 @@ enum act_return http_action_req_cache_use(struct act_rule *rule, struct proxy *p shctx_lock(shctx_ptr(cache)); res = entry_exist(cache, s->txn->cache_hash); - /* We must not use an entry that is not complete. */ - if (res && res->complete) { + /* We must not use an entry that is not complete but the check will be + * performed after we look for a potential secondary entry (in case of + * Vary). */ + if (res) { struct appctx *appctx; entry_block = block_ptr(res); shctx_row_inc_hot(shctx_ptr(cache), entry_block); @@ -1830,9 +1832,11 @@ enum act_return http_action_req_cache_use(struct act_rule *rule, struct proxy *p res = NULL; } - /* We looked for a valid secondary entry and could not find one, - * the request must be forwarded to the server. 
*/ - if (!res) { + /* We either looked for a valid secondary entry and could not + * find one, or the entry we want to use is not complete. We + * can't use the cache's entry and must forward the request to + * the server. */ + if (!res || !res->complete) { shctx_lock(shctx_ptr(cache)); shctx_row_dec_hot(shctx_ptr(cache), entry_block); shctx_unlock(shctx_ptr(cache)); From d203c0112b7ae1c8ca155d3fb710d6910b71e104 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 22 Feb 2023 15:36:03 +0100 Subject: [PATCH 053/140] BUG/MINOR: ring: do not realign ring contents on resize If a ring is resized, we must not zero its head since the contents are preserved in-situ. Till now it used to work because we only resize during boot and we emit very few data (if at all) during boot. But this can change in the future. This can be backported to 2.2 though no older version should notice a difference. (cherry picked from commit d0d85d2e364b1a8dffd605de18469c9f300aae32) Signed-off-by: Christopher Faulet (cherry picked from commit bc16d31f889336578fd6d1fb72c9306816df23f4) Signed-off-by: Willy Tarreau --- src/ring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ring.c b/src/ring.c index 089a2fc9f..e40a0680b 100644 --- a/src/ring.c +++ b/src/ring.c @@ -123,7 +123,6 @@ struct ring *ring_resize(struct ring *ring, size_t size) b_getblk(&ring->buf, area, ring->buf.data, 0); area = HA_ATOMIC_XCHG(&ring->buf.area, area); ring->buf.size = size; - ring->buf.head = 0; } HA_RWLOCK_WRUNLOCK(LOGSRV_LOCK, &ring->lock); From 4f0b1df09e27822a433b9ef5e3def57cff61cbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Fri, 24 Feb 2023 09:47:07 +0100 Subject: [PATCH 054/140] BUILD: thead: Fix several 32 bits compilation issues with uint64_t variables Cast uint64_t as ullong and difference between two uint64_t as llong. 
(cherry picked from commit 83540ed429ae527650432a501728f4ed5d377f4b) Signed-off-by: Christopher Faulet (cherry picked from commit 87ae041fa43bd00a2827c44b147c5a9bb8581ee7) Signed-off-by: Willy Tarreau --- src/thread.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/thread.c b/src/thread.c index 7318de973..eb8e798c8 100644 --- a/src/thread.c +++ b/src/thread.c @@ -462,37 +462,37 @@ void show_lock_stats() if (lock_stats[lbl].num_write_locked) fprintf(stderr, - "\t # write lock : %lu\n" - "\t # write unlock: %lu (%ld)\n" + "\t # write lock : %llu\n" + "\t # write unlock: %llu (%lld)\n" "\t # wait time for write : %.3f msec\n" "\t # wait time for write/lock: %.3f nsec\n", - lock_stats[lbl].num_write_locked, - lock_stats[lbl].num_write_unlocked, - lock_stats[lbl].num_write_unlocked - lock_stats[lbl].num_write_locked, + (ullong)lock_stats[lbl].num_write_locked, + (ullong)lock_stats[lbl].num_write_unlocked, + (llong)(lock_stats[lbl].num_write_unlocked - lock_stats[lbl].num_write_locked), (double)lock_stats[lbl].nsec_wait_for_write / 1000000.0, lock_stats[lbl].num_write_locked ? ((double)lock_stats[lbl].nsec_wait_for_write / (double)lock_stats[lbl].num_write_locked) : 0); if (lock_stats[lbl].num_seek_locked) fprintf(stderr, - "\t # seek lock : %lu\n" - "\t # seek unlock : %lu (%ld)\n" + "\t # seek lock : %llu\n" + "\t # seek unlock : %llu (%lld)\n" "\t # wait time for seek : %.3f msec\n" "\t # wait time for seek/lock : %.3f nsec\n", - lock_stats[lbl].num_seek_locked, - lock_stats[lbl].num_seek_unlocked, - lock_stats[lbl].num_seek_unlocked - lock_stats[lbl].num_seek_locked, + (ullong)lock_stats[lbl].num_seek_locked, + (ullong)lock_stats[lbl].num_seek_unlocked, + (llong)(lock_stats[lbl].num_seek_unlocked - lock_stats[lbl].num_seek_locked), (double)lock_stats[lbl].nsec_wait_for_seek / 1000000.0, lock_stats[lbl].num_seek_locked ? 
((double)lock_stats[lbl].nsec_wait_for_seek / (double)lock_stats[lbl].num_seek_locked) : 0); if (lock_stats[lbl].num_read_locked) fprintf(stderr, - "\t # read lock : %lu\n" - "\t # read unlock : %lu (%ld)\n" + "\t # read lock : %llu\n" + "\t # read unlock : %llu (%lld)\n" "\t # wait time for read : %.3f msec\n" "\t # wait time for read/lock : %.3f nsec\n", - lock_stats[lbl].num_read_locked, - lock_stats[lbl].num_read_unlocked, - lock_stats[lbl].num_read_unlocked - lock_stats[lbl].num_read_locked, + (ullong)lock_stats[lbl].num_read_locked, + (ullong)lock_stats[lbl].num_read_unlocked, + (llong)(lock_stats[lbl].num_read_unlocked - lock_stats[lbl].num_read_locked), (double)lock_stats[lbl].nsec_wait_for_read / 1000000.0, lock_stats[lbl].num_read_locked ? ((double)lock_stats[lbl].nsec_wait_for_read / (double)lock_stats[lbl].num_read_locked) : 0); } From b81bad8b0c8c0d495fed2728fe8c7b2c2c24af16 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Fri, 24 Feb 2023 16:49:06 +0100 Subject: [PATCH 055/140] BUG/MEDIUM: h1-htx: Never copy more than the max data allowed during parsing A bug during H1 data parsing may lead to copy more data than the maximum allowed. The bug is an overflow on this max threshold when it is lower than the size of an htx_blk structure. At first glance, it means it is possible to not respsect the buffer's reserve. So it may lead to rewrite errors but it may also block any progress on the stream if the compression is enabled. In this case, the channel buffer appears as full and the compression must wait for space to proceed. Outside of any bug, it is only possible when there are outgoing data to forward, so the compression filter just waits. Because of this bug, there is nothing to forward. The buffer is just full of input data. Thus nothing move and the stream is infinitly blocked. To fix the bug, we must be sure to be able to create an HTX block of 1 byte without exceeding the maximum allowed. This patch should fix the issue #2053. 
It must be backported as far as 2.5. (cherry picked from commit c9ec9bc8346783417fe732cc4d22a074dca263f1) Signed-off-by: Christopher Faulet (cherry picked from commit 50920603685d42aada0e71893618f5b4831ea42e) Signed-off-by: Willy Tarreau --- src/h1_htx.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/h1_htx.c b/src/h1_htx.c index 06b2345ec..cb4144883 100644 --- a/src/h1_htx.c +++ b/src/h1_htx.c @@ -417,6 +417,8 @@ static size_t h1_copy_msg_data(struct htx **dsthtx, struct buffer *srcbuf, size_ /* Be prepared to create at least one HTX block by reserving its size * and adjust accordingly. */ + if (max <= sizeof(struct htx_blk)) + goto end; max -= sizeof(struct htx_blk); if (count > max) count = max; @@ -507,8 +509,7 @@ static size_t h1_parse_chunk(struct h1m *h1m, struct htx **dsthtx, case H1_MSG_DATA: new_chunk: used = htx_used_space(*dsthtx); - - if (b_data(srcbuf) == ofs || !lmax) + if (b_data(srcbuf) == ofs || lmax <= sizeof(struct htx_blk)) break; sz = b_data(srcbuf) - ofs; @@ -588,6 +589,10 @@ static size_t h1_parse_full_contig_chunks(struct h1m *h1m, struct htx **dsthtx, uint64_t chksz; struct htx_ret htxret; + lmax = *max; + if (lmax <= sizeof(struct htx_blk)) + goto out; + /* source info : * start : pointer at position * end : pointer marking the end of data to parse @@ -616,7 +621,6 @@ static size_t h1_parse_full_contig_chunks(struct h1m *h1m, struct htx **dsthtx, * from . Then we must adjust it if it exceeds the free size in the * block. 
*/ - lmax = *max; if (!dpos) lmax -= sizeof(struct htx_blk); if (lmax > htx_get_blksz(htxret.blk) - dpos) @@ -829,7 +833,7 @@ size_t h1_parse_msg_data(struct h1m *h1m, struct htx **dsthtx, { size_t sz, total = 0; - if (b_data(srcbuf) == ofs || !max) + if (b_data(srcbuf) == ofs || max <= sizeof(struct htx_blk)) return 0; if (h1m->flags & H1_MF_CLEN) { From e1f228fd27cf04632be028713964fc6cbfebfa76 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Mon, 20 Feb 2023 17:09:34 +0100 Subject: [PATCH 056/140] DOC: config: Fix description of options about HTTP connection modes Since the HTX, the description of options about HTTP connection modes is wrong. In fact, it is worse, all the documentation about HTTP connection mode is wrong. But only options will be updated for now to be backported. So, documentation of "option httpclose", "option http-keep-alive", "option http-server-close" and "option http-pretend-keepalive" was reviewed. First, it is specified these options only concern HTTP/1.x connections. Then, the descriptions were updated to reflect the HTX implementation. The main changes concern the fact that server connections are no longer attached to client connections. The connection mode on one side does not affect the connection mode on the other side. It is especially true for "option httpclose". For client connections, only the frontend option is considered and for server ones, both frontend and backend options are considered. This patch should be backported as far as 2.2. 
(cherry picked from commit 85523a02124f2b9dec7473e5f9f56cc9703be5c0) Signed-off-by: Christopher Faulet (cherry picked from commit 173475389b361f6c6363e33e1ed0f452e1560e90) Signed-off-by: Willy Tarreau --- doc/configuration.txt | 116 +++++++++++++++++++----------------------- 1 file changed, 51 insertions(+), 65 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 4015ab76f..3bedddff6 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -8968,18 +8968,18 @@ no option http-ignore-probes option http-keep-alive no option http-keep-alive - Enable or disable HTTP keep-alive from client to server + Enable or disable HTTP keep-alive from client to server for HTTP/1.x + connections May be used in sections : defaults | frontend | listen | backend yes | yes | yes | yes Arguments : none By default HAProxy operates in keep-alive mode with regards to persistent - connections: for each connection it processes each request and response, and - leaves the connection idle on both sides between the end of a response and - the start of a new request. This mode may be changed by several options such - as "option http-server-close" or "option httpclose". This option allows to - set back the keep-alive mode, which can be useful when another mode was used - in a defaults section. + HTTP/1.x connections: for each connection it processes each request and + response, and leaves the connection idle on both sides. This mode may be + changed by several options such as "option http-server-close" or "option + httpclose". This option allows to set back the keep-alive mode, which can be + useful when another mode was used in a defaults section. Setting "option http-keep-alive" enables HTTP keep-alive mode on the client- and server- sides. This provides the lowest latency on the client side (slow @@ -8996,15 +8996,6 @@ no option http-keep-alive compared to the cost of retrieving the associated object from the server. 
This last case can happen when the server is a fast static server of cache. - In this case, the server will need to be properly tuned to support high enough - connection counts because connections will last until the client sends another - request. - - If the client request has to go to another backend or another server due to - content switching or the load balancing algorithm, the idle connection will - immediately be closed and a new one re-opened. Option "prefer-last-server" is - available to try optimize server selection so that if the server currently - attached to an idle connection is usable, it will be used. At the moment, logs will not indicate whether requests came from the same session or not. The accept date reported in the logs corresponds to the end @@ -9014,12 +9005,10 @@ no option http-keep-alive not set. This option disables and replaces any previous "option httpclose" or "option - http-server-close". When backend and frontend options differ, all of these 4 - options have precedence over "option http-keep-alive". + http-server-close". See also : "option httpclose",, "option http-server-close", - "option prefer-last-server", "option http-pretend-keepalive", - and "1.1. The HTTP transaction model". + "option prefer-last-server" and "option http-pretend-keepalive". option http-no-delay @@ -9058,19 +9047,19 @@ no option http-no-delay option http-pretend-keepalive no option http-pretend-keepalive - Define whether HAProxy will announce keepalive to the server or not + Define whether HAProxy will announce keepalive for HTTP/1.x connection to the + server or not May be used in sections : defaults | frontend | listen | backend yes | no | yes | yes Arguments : none When running with "option http-server-close" or "option httpclose", HAProxy - adds a "Connection: close" header to the request forwarded to the server. 
- Unfortunately, when some servers see this header, they automatically refrain - from using the chunked encoding for responses of unknown length, while this - is totally unrelated. The immediate effect is that this prevents HAProxy from - maintaining the client connection alive. A second effect is that a client or - a cache could receive an incomplete response without being aware of it, and - consider the response complete. + adds a "Connection: close" header to the HTTP/1.x request forwarded to the + server. Unfortunately, when some servers see this header, they automatically + refrain from using the chunked encoding for responses of unknown length, + while this is totally unrelated. The effect is that a client or a cache could + receive an incomplete response without being aware of it, and consider the + response complete. By setting "option http-pretend-keepalive", HAProxy will make the server believe it will keep the connection alive. The server will then not fall back @@ -9090,9 +9079,7 @@ no option http-pretend-keepalive This option may be set in backend and listen sections. Using it in a frontend section will be ignored and a warning will be reported during startup. It is a backend related option, so there is no real reason to set it on a - frontend. This option may be combined with "option httpclose", which will - cause keepalive to be announced to the server and close to be announced to - the client. This practice is discouraged though. + frontend. If this option has been enabled in a "defaults" section, it can be disabled in a specific instance by prepending the "no" keyword before it. 
@@ -9130,26 +9117,25 @@ option http-restrict-req-hdr-names { preserve | delete | reject } option http-server-close no option http-server-close - Enable or disable HTTP connection closing on the server side + Enable or disable HTTP/1.x connection closing on the server side May be used in sections : defaults | frontend | listen | backend yes | yes | yes | yes Arguments : none By default HAProxy operates in keep-alive mode with regards to persistent - connections: for each connection it processes each request and response, and - leaves the connection idle on both sides between the end of a response and - the start of a new request. This mode may be changed by several options such - as "option http-server-close" or "option httpclose". Setting "option - http-server-close" enables HTTP connection-close mode on the server side - while keeping the ability to support HTTP keep-alive and pipelining on the - client side. This provides the lowest latency on the client side (slow - network) and the fastest session reuse on the server side to save server - resources, similarly to "option httpclose". It also permits non-keepalive - capable servers to be served in keep-alive mode to the clients if they - conform to the requirements of RFC7230. Please note that some servers do not - always conform to those requirements when they see "Connection: close" in the - request. The effect will be that keep-alive will never be used. A workaround - consists in enabling "option http-pretend-keepalive". + HTTP/1.x connections: for each connection it processes each request and + response, and leaves the connection idle on both sides. This mode may be + changed by several options such as "option http-server-close" or "option + httpclose". Setting "option http-server-close" enables HTTP connection-close + mode on the server side while keeping the ability to support HTTP keep-alive + and pipelining on the client side. 
This provides the lowest latency on the + client side (slow network) and the fastest session reuse on the server side + to save server resources, similarly to "option httpclose". It also permits + non-keepalive capable servers to be served in keep-alive mode to the clients + if they conform to the requirements of RFC7230. Please note that some servers + do not always conform to those requirements when they see "Connection: close" + in the request. The effect will be that keep-alive will never be used. A + workaround consists in enabling "option http-pretend-keepalive". At the moment, logs will not indicate whether requests came from the same session or not. The accept date reported in the logs corresponds to the end @@ -9167,8 +9153,8 @@ no option http-server-close If this option has been enabled in a "defaults" section, it can be disabled in a specific instance by prepending the "no" keyword before it. - See also : "option httpclose", "option http-pretend-keepalive", - "option http-keep-alive", and "1.1. The HTTP transaction model". + See also : "option httpclose", "option http-pretend-keepalive" and + "option http-keep-alive". option http-use-proxy-header no option http-use-proxy-header @@ -9265,37 +9251,37 @@ option httpchk option httpclose no option httpclose - Enable or disable HTTP connection closing + Enable or disable HTTP/1.x connection closing May be used in sections : defaults | frontend | listen | backend yes | yes | yes | yes Arguments : none By default HAProxy operates in keep-alive mode with regards to persistent - connections: for each connection it processes each request and response, and - leaves the connection idle on both sides between the end of a response and - the start of a new request. This mode may be changed by several options such - as "option http-server-close" or "option httpclose". + HTTP/1.x connections: for each connection it processes each request and + response, and leaves the connection idle on both sides. 
This mode may be + changed by several options such as "option http-server-close" or "option + httpclose". - If "option httpclose" is set, HAProxy will close connections with the server - and the client as soon as the request and the response are received. It will - also check if a "Connection: close" header is already set in each direction, - and will add one if missing. Any "Connection" header different from "close" - will also be removed. + If "option httpclose" is set, HAProxy will close the client or the server + connection, depending where the option is set. Only the frontend is + considered for client connections while the frontend and the backend are + considered for server ones. In this case the option is enabled if at least + one of the frontend or backend holding the connection has it enabled. If the + option is set on a listener, it is applied both on client and server + connections. It will check if a "Connection: close" header is already set in + each direction, and will add one if missing. This option may also be combined with "option http-pretend-keepalive", which - will disable sending of the "Connection: close" header, but will still cause - the connection to be closed once the whole response is received. + will disable sending of the "Connection: close" request header, but will + still cause the connection to be closed once the whole response is received. - This option may be set both in a frontend and in a backend. It is enabled if - at least one of the frontend or backend holding a connection has it enabled. It disables and replaces any previous "option http-server-close" or "option - http-keep-alive". Please check section 4 ("Proxies") to see how this option - combines with others when frontend and backend options differ. + http-keep-alive". If this option has been enabled in a "defaults" section, it can be disabled in a specific instance by prepending the "no" keyword before it. - See also : "option http-server-close" and "1.1. 
The HTTP transaction model". + See also : "option http-server-close". option httplog [ clf ] From 45d23ba5fa974ddd18aa98b3a473ab2218304cf9 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Mon, 20 Feb 2023 14:33:46 +0100 Subject: [PATCH 057/140] DOC: config: Add the missing tune.fail-alloc option from global listing This global option is documented but it is not in the list of supported options for the global section. So let's add it. This patch could be backported to all stable versions. (cherry picked from commit 760a3841bdf0114809ae3fe2faf7996750cb41a6) Signed-off-by: Christopher Faulet (cherry picked from commit 6a66a61e67492a4c43573a7dc6556738cdf8720f) Signed-off-by: Willy Tarreau --- doc/configuration.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/configuration.txt b/doc/configuration.txt index 3bedddff6..b3f73f8fb 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -1107,6 +1107,7 @@ The following keywords are supported in the "global" section : - tune.buffers.reserve - tune.bufsize - tune.comp.maxlevel + - tune.fail-alloc - tune.fd.edge-triggered - tune.h2.header-table-size - tune.h2.initial-window-size From 365f227433a74134e59aaff065fa6b793a3820a0 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Mon, 27 Feb 2023 17:53:31 +0100 Subject: [PATCH 058/140] DOC: config: Clarify the meaning of 'hold' in the 'resolvers' section This patch improves the 'hold' parameter description in the 'resolvers' section to make it clearer. It really explains differences between all status. Thanks to Nick Ramirez for this update. This patch should solve the issue #1694. It could be backported to all stable versions. 
(cherry picked from commit 24b319b695aaaa4f2cdae741a623296dad2a5174) Signed-off-by: Christopher Faulet (cherry picked from commit b488f3d750620cf248f6409a4347da9e49fba1b1) Signed-off-by: Willy Tarreau --- doc/configuration.txt | 54 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index b3f73f8fb..9b1cf3980 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -15951,15 +15951,53 @@ parse-resolv-conf placed in the resolvers section in place of this directive. hold <status> <period> - Defines <period> during which the last name resolution should be kept based - on last resolution <status> - <status> : last name resolution status. Acceptable values are "nx", - "other", "refused", "timeout", "valid", "obsolete". - <period> : interval between two successive name resolution when the last - answer was in <status>. It follows the HAProxy time format. - <period> is in milliseconds by default. + Upon receiving the DNS response <status>, determines whether a server's state + should change from UP to DOWN. To make that determination, it checks whether + any valid status has been received during the past <period> in order to + counteract the just received invalid status. - Default value is 10s for "valid", 0s for "obsolete" and 30s for others. + <status> : last name resolution status. nx After receiving an NXDOMAIN status, check for any valid + status during the concluding period. + + refused After receiving a REFUSED status, check for any valid + status during the concluding period. + + timeout After the "timeout retry" has struck, check for any + valid status during the concluding period. + + other After receiving any other invalid status, check for any + valid status during the concluding period. + + valid Applies only to "http-request do-resolve" and + "tcp-request content do-resolve" actions. It defines the + period for which the server will maintain a valid response + before triggering another resolution.
It does not affect + dynamic resolution of servers. + + obsolete Defines how long to wait before removing obsolete DNS + records after an updated answer record is received. It + applies to SRV records. + + <period> : Amount of time into the past during which a valid response must + have been received. It follows the HAProxy time format and is in + milliseconds by default. + + For a server that relies on dynamic DNS resolution to determine its IP + address, receiving an invalid DNS response, such as NXDOMAIN, will lead to + changing the server's state from UP to DOWN. The hold directives define how + far into the past to look for a valid response. If a valid response has been + received within <period>, the just received invalid status will be ignored. + + Unless a valid response has been received during the concluding period, the + server will be marked as DOWN. For example, if "hold nx 30s" is set and the + last received DNS response was NXDOMAIN, the server will be marked DOWN + unless a valid response has been received during the last 30 seconds. + + A server in the DOWN state will be marked UP immediately upon receiving a + valid status from the DNS server. + + A separate behavior exists for "hold valid" and "hold obsolete". resolve_retries <nb> Defines the number of queries to send to resolve a server name before From d7be206d3570138cfadca87bb768293804629bc7 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 28 Feb 2023 15:39:38 +0100 Subject: [PATCH 059/140] BUG/MEDIUM: connection: Clear flags when a conn is removed from an idle list When a connection is removed from the safe list or the idle list, CO_FL_SAFE_LIST and CO_FL_IDLE_LIST flags must be cleared. It is performed when the connection is reused. But not when it is moved into the toremove_conns list. It may be an issue because the multiplexer owning the connection may be woken up before the connection is really removed.
If the connection flags are not sanitized, it may think the connection is idle and reinsert it in the corresponding list. From this point, we can imagine several bugs. An UAF or a connection reused with an invalid state for instance. To avoid any issue, the connection flags are sanitized when an idle connection is moved into the toremove_conns list. The same is performed at right places in the multiplexers. Especially because the connection release may be delayed (for h2 and fcgi connections). This patch shoudld fix the issue #2057. It must carefully be backported as far as 2.2. Especially on the 2.2 where the code is really different. But some conflicts should be expected on the 2.4 too. (cherry picked from commit 5e1b0e7bf86a300def07388df0ea7f4b3f9e68b9) Signed-off-by: Christopher Faulet (cherry picked from commit 7902ebadb1ffbe0237ce974b950ca595894f3774) Signed-off-by: Willy Tarreau --- src/mux_fcgi.c | 4 +++- src/mux_h1.c | 4 +++- src/mux_h2.c | 7 ++++++- src/server.c | 2 ++ 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c index 31bc3ad0c..4981f6bab 100644 --- a/src/mux_fcgi.c +++ b/src/mux_fcgi.c @@ -3227,8 +3227,10 @@ struct task *fcgi_timeout_task(struct task *t, void *context, unsigned int state /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. */ - if (fconn->conn->flags & CO_FL_LIST_MASK) + if (fconn->conn->flags & CO_FL_LIST_MASK) { conn_delete_from_tree(&fconn->conn->hash_node->node); + fconn->conn->flags &= ~CO_FL_LIST_MASK; + } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/mux_h1.c b/src/mux_h1.c index 754b57208..56b08a77e 100644 --- a/src/mux_h1.c +++ b/src/mux_h1.c @@ -3282,8 +3282,10 @@ struct task *h1_timeout_task(struct task *t, void *context, unsigned int state) /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. 
*/ - if (h1c->conn->flags & CO_FL_LIST_MASK) + if (h1c->conn->flags & CO_FL_LIST_MASK) { conn_delete_from_tree(&h1c->conn->hash_node->node); + h1c->conn->flags &= ~CO_FL_LIST_MASK; + } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/mux_h2.c b/src/mux_h2.c index 94017e9bc..f4cb5b188 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -4163,6 +4163,7 @@ static int h2_process(struct h2c *h2c) if (conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&conn->hash_node->node); + conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } } @@ -4171,6 +4172,7 @@ static int h2_process(struct h2c *h2c) if (conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&conn->hash_node->node); + conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } } @@ -4251,8 +4253,10 @@ struct task *h2_timeout_task(struct task *t, void *context, unsigned int state) /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. 
*/ - if (h2c->conn->flags & CO_FL_LIST_MASK) + if (h2c->conn->flags & CO_FL_LIST_MASK) { conn_delete_from_tree(&h2c->conn->hash_node->node); + h2c->conn->flags &= ~CO_FL_LIST_MASK; + } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } @@ -4305,6 +4309,7 @@ do_leave: if (h2c->conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&h2c->conn->hash_node->node); + h2c->conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/server.c b/src/server.c index cccf2f5a5..8a282bcf9 100644 --- a/src/server.c +++ b/src/server.c @@ -5717,6 +5717,7 @@ static int srv_migrate_conns_to_remove(struct eb_root *idle_tree, struct mt_list hash_node = ebmb_entry(node, struct conn_hash_node, node); eb_delete(node); + hash_node->conn->flags &= ~CO_FL_LIST_MASK; MT_LIST_APPEND(toremove_list, &hash_node->conn->toremove_list); i++; @@ -5774,6 +5775,7 @@ void srv_release_conn(struct server *srv, struct connection *conn) /* Remove the connection from any tree (safe, idle or available) */ HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&conn->hash_node->node); + conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } From e28170bfcc885e3bcea98314d22236b3781309e7 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 28 Feb 2023 18:44:14 +0100 Subject: [PATCH 060/140] BUG/MINOR: http-check: Don't set HTX_SL_F_BODYLESS flag with a log-format body When the HTTP request of a health-check is forged, we must not pretend there is no payload, by setting HTX_SL_F_BODYLESS, if a log-format body was configured. Indeed, a test on the body length was used but it is only valid for a plain string. For a log-format string, a list is used. Note it is a bug with no consequence for now. This patch must be backported as far as 2.2.
(cherry picked from commit 0506d9de512291ed2526654800b98a2317a67b6d) Signed-off-by: Christopher Faulet (cherry picked from commit dd931360c79520aef02394a8027e67cea975b4a2) Signed-off-by: Willy Tarreau --- src/tcpcheck.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tcpcheck.c b/src/tcpcheck.c index c77e3ad19..dfda77fc6 100644 --- a/src/tcpcheck.c +++ b/src/tcpcheck.c @@ -1412,7 +1412,7 @@ enum tcpcheck_eval_ret tcpcheck_eval_send(struct check *check, struct tcpcheck_r (istlen(vsn) == 8 && (*(vsn.ptr+5) > '1' || (*(vsn.ptr+5) == '1' && *(vsn.ptr+7) >= '1')))) slflags |= HTX_SL_F_VER_11; slflags |= (HTX_SL_F_XFER_LEN|HTX_SL_F_CLEN); - if (!isttest(send->http.body)) + if (!(send->http.flags & TCPCHK_SND_HTTP_FL_BODY_FMT) && !isttest(send->http.body)) slflags |= HTX_SL_F_BODYLESS; htx = htx_from_buf(&check->bo); From 45b9f344974733da888c9ad668ed28b02544fcef Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 28 Feb 2023 18:51:26 +0100 Subject: [PATCH 061/140] BUG/MINOR: http-check: Skip C-L header for empty body when it's not mandatory The Content-Length header is always added into the request for an HTTP health-check. However, when there is no payload, this header may be skipped for OPTIONS, GET, HEAD and DELETE methods. In fact, it is a "SHOULD NOT" in the RCF 9110 (#8.6). It is not really an issue in itself but it seems to be an issue for AWS ELB. It returns a 400-Bad-Request if a HEAD/GET request with no payload contains a Content-Length header. So, it is better to skip this header when possible. This patch should fix the issue #2026. It could be backported as far as 2.2. 
(cherry picked from commit d48bfb6983a3d28183b068a4f8975c1c5cd05978) Signed-off-by: Christopher Faulet (cherry picked from commit fd5ba13e9892926ac0dd14b9dbf3e827af3b6f18) Signed-off-by: Willy Tarreau --- src/tcpcheck.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/tcpcheck.c b/src/tcpcheck.c index dfda77fc6..6441d34c0 100644 --- a/src/tcpcheck.c +++ b/src/tcpcheck.c @@ -1459,12 +1459,18 @@ enum tcpcheck_eval_ret tcpcheck_eval_send(struct check *check, struct tcpcheck_r } else body = send->http.body; - clen = ist((!istlen(body) ? "0" : ultoa(istlen(body)))); - if ((!connection_hdr && !htx_add_header(htx, ist("Connection"), ist("close"))) || - !htx_add_header(htx, ist("Content-length"), clen)) + if (!connection_hdr && !htx_add_header(htx, ist("Connection"), ist("close"))) goto error_htx; + if ((send->http.meth.meth != HTTP_METH_OPTIONS && + send->http.meth.meth != HTTP_METH_GET && + send->http.meth.meth != HTTP_METH_HEAD && + send->http.meth.meth != HTTP_METH_DELETE) || istlen(body)) { + clen = ist((!istlen(body) ? "0" : ultoa(istlen(body)))); + if (!htx_add_header(htx, ist("Content-length"), clen)) + goto error_htx; + } if (!htx_add_endof(htx, HTX_BLK_EOH) || (istlen(body) && !htx_add_data_atonce(htx, body))) From 2e44aa9a377d4404a33693b2e1d113a820469724 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Wed, 1 Mar 2023 15:45:39 +0100 Subject: [PATCH 062/140] BUG/MINOR: http-ana: Don't increment conn_retries counter before the L7 retry When we are about to perform a L7 retry, we deal with the conn_retries counter, to be sure we can retry. However, there is an issue here because the counter is incremented before it is checked against the backend limit. So, we can miss a connection retry. Of course, we must invert both operations. The conn_retries counter must be incremented after the check against the backend limit. This patch must be backported as far as 2.6.
(cherry picked from commit 41ade746c727596d5ac6fa10446be25c19d39362) Signed-off-by: Christopher Faulet (cherry picked from commit f50337c61d0791fcb7ffce2da1ab11cfeac1e692) Signed-off-by: Willy Tarreau --- src/http_ana.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/http_ana.c b/src/http_ana.c index 13f9641bd..90aa65ee8 100644 --- a/src/http_ana.c +++ b/src/http_ana.c @@ -1258,10 +1258,9 @@ static __inline int do_l7_retry(struct stream *s, struct stconn *sc) struct channel *req, *res; int co_data; - s->conn_retries++; if (s->conn_retries >= s->be->conn_retries) return -1; - + s->conn_retries++; if (objt_server(s->target)) { if (s->flags & SF_CURR_SESS) { s->flags &= ~SF_CURR_SESS; From bd65ed0214654fc9b489809a1372facca250e7f8 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Wed, 1 Mar 2023 15:47:18 +0100 Subject: [PATCH 063/140] BUG/MINOR: http-ana: Do a L7 retry on read error if there is no response A regression about "empty-response" L7 retry was introduced with the commit dd6496f591 ("CLEANUP: http-ana: Remove useless if statement about L7 retries"). The if statement was removed on a wrong assumption. Indeed, L7 retries on status is now handled in the HTTP analysers. Thus, the stream-connector (formerly the conn-stream, and before again the stream-interface) no longer reports a read error to force a retry. But it is still possible to get a read error with no response. In this case, we must perform a retry if "empty-response" is enabled. So the if statement is re-introduced, reverting the cleanup. This patch should fix the issue #2061. It must be backported as far as 2.4.
(cherry picked from commit 6f78ac56059ac00265197c04b801a819dd730d8e) Signed-off-by: Christopher Faulet (cherry picked from commit d8dba50ac789abdc068c0232bf49cb9ad935a881) Signed-off-by: Willy Tarreau --- src/http_ana.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/http_ana.c b/src/http_ana.c index 90aa65ee8..273fe16d0 100644 --- a/src/http_ana.c +++ b/src/http_ana.c @@ -1358,7 +1358,15 @@ int http_wait_for_response(struct stream *s, struct channel *rep, int an_bit) if (rep->flags & CF_READ_ERROR) { struct connection *conn = sc_conn(s->scb); - /* Perform a L7 retry because server refuses the early data. */ + + if ((txn->flags & TX_L7_RETRY) && + (s->be->retry_type & PR_RE_DISCONNECTED) && + (!conn || conn->err_code != CO_ER_SSL_EARLY_FAILED)) { + if (co_data(rep) || do_l7_retry(s, s->scb) == 0) + return 0; + } + + /* Perform a L7 retry on empty response or because server refuses the early data. */ if ((txn->flags & TX_L7_RETRY) && (s->be->retry_type & PR_RE_EARLY_ERROR) && conn && conn->err_code == CO_ER_SSL_EARLY_FAILED && From 3109c7b1187c3a1a85382acfa16c6577a0d1d352 Mon Sep 17 00:00:00 2001 From: Remi Tricot-Le Breton Date: Thu, 2 Mar 2023 15:49:55 +0100 Subject: [PATCH 064/140] BUG/MINOR: ssl: Use 'date' instead of 'now' in ocsp stapling callback In the OCSP response callback, instead of using the actual date of the system, the scheduler's 'now' timer is used when checking a response's validity. This patch can be backported to all stable versions. 
(cherry picked from commit 8c20a74c90964b7bd76144fe8dbde15f227d9a04) [cf: applied in src/ssl_sock.c] Signed-off-by: Christopher Faulet (cherry picked from commit dbec9cad895ff5fe00470dceda1d1e3abeeeae2c) Signed-off-by: Willy Tarreau --- src/ssl_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ssl_sock.c b/src/ssl_sock.c index 0e2569f80..2c47fcb00 100644 --- a/src/ssl_sock.c +++ b/src/ssl_sock.c @@ -1422,7 +1422,7 @@ int ssl_sock_ocsp_stapling_cbk(SSL *ssl, void *arg) if (!ocsp || !ocsp->response.area || !ocsp->response.data || - (ocsp->expire < now.tv_sec)) + (ocsp->expire < date.tv_sec)) return SSL_TLSEXT_ERR_NOACK; ssl_buf = OPENSSL_malloc(ocsp->response.data); From becf3a50e745232f168a8ceb08a0df71f510cfb2 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Mon, 13 Feb 2023 10:58:13 +0100 Subject: [PATCH 065/140] MINOR: ssl: rename confusing ssl_bind_kws The ssl_bind_kw structure is exclusively used for crt-list keyword, it must be named otherwise to remove the confusion. The structure was renamed ssl_crtlist_kws. 
(cherry picked from commit af678066518ea5569005b5e43c140a8facb2ee61) Signed-off-by: William Lallemand (cherry picked from commit 30b1d8b63f8ed09ab2b2b7bcb4d6b4ba5f4f45e8) Signed-off-by: Willy Tarreau --- include/haproxy/listener-t.h | 4 +++- include/haproxy/ssl_sock.h | 2 +- src/cfgparse-ssl.c | 4 ++-- src/cfgparse.c | 10 +++++----- src/ssl_crtlist.c | 10 +++++----- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/include/haproxy/listener-t.h b/include/haproxy/listener-t.h index 46921a4b9..9928ebd62 100644 --- a/include/haproxy/listener-t.h +++ b/include/haproxy/listener-t.h @@ -279,7 +279,9 @@ struct bind_kw { int (*parse)(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err); int skip; /* nb of args to skip */ }; -struct ssl_bind_kw { + +/* same as bind_kw but for crtlist keywords */ +struct ssl_crtlist_kw { const char *kw; int (*parse)(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err); int skip; /* nb of args to skip */ diff --git a/include/haproxy/ssl_sock.h b/include/haproxy/ssl_sock.h index 583266247..87ae38cfc 100644 --- a/include/haproxy/ssl_sock.h +++ b/include/haproxy/ssl_sock.h @@ -37,7 +37,7 @@ extern struct eb_root crtlists_tree; extern struct eb_root cafile_tree; extern int sctl_ex_index; extern struct global_ssl global_ssl; -extern struct ssl_bind_kw ssl_bind_kws[]; +extern struct ssl_crtlist_kw ssl_crtlist_kws[]; extern struct methodVersions methodVersions[]; __decl_thread(extern HA_SPINLOCK_T ckch_lock); extern struct pool_head *pool_head_ssl_capture; diff --git a/src/cfgparse-ssl.c b/src/cfgparse-ssl.c index 35780adff..63b41bfa9 100644 --- a/src/cfgparse-ssl.c +++ b/src/cfgparse-ssl.c @@ -1858,9 +1858,9 @@ static int ssl_parse_skip_self_issued_ca(char **args, int section_type, struct p * not enabled. 
*/ -/* the keywords are used for crt-list parsing, they *MUST* be safe +/* the keywords are used for crt-list parsing, they *MUST* be safe * with their proxy argument NULL and must only fill the ssl_bind_conf */ -struct ssl_bind_kw ssl_bind_kws[] = { +struct ssl_crtlist_kw ssl_crtlist_kws[] = { { "allow-0rtt", ssl_bind_parse_allow_0rtt, 0 }, /* allow 0-RTT */ { "alpn", ssl_bind_parse_alpn, 1 }, /* set ALPN supported protocols */ { "ca-file", ssl_bind_parse_ca_file, 1 }, /* set CAfile to process ca-names and verify on client cert */ diff --git a/src/cfgparse.c b/src/cfgparse.c index da6127575..fa9945b0c 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -4409,13 +4409,13 @@ void cfg_dump_registered_keywords() extern struct list tcp_req_conn_keywords, tcp_req_sess_keywords, tcp_req_cont_keywords, tcp_res_cont_keywords; extern struct bind_kw_list bind_keywords; - extern struct ssl_bind_kw ssl_bind_kws[] __maybe_unused; + extern struct ssl_crtlist_kw ssl_crtlist_kws[] __maybe_unused; extern struct srv_kw_list srv_keywords; struct bind_kw_list *bkwl; struct srv_kw_list *skwl; const struct bind_kw *bkwp, *bkwn; const struct srv_kw *skwp, *skwn; - const struct ssl_bind_kw *sbkwp __maybe_unused, *sbkwn __maybe_unused; + const struct ssl_crtlist_kw *sbkwp __maybe_unused, *sbkwn __maybe_unused; const struct cfg_opt *coptp, *coptn; for (bkwn = bkwp = NULL;; bkwp = bkwn) { @@ -4437,11 +4437,11 @@ void cfg_dump_registered_keywords() #if defined(USE_OPENSSL) for (sbkwn = sbkwp = NULL;; sbkwp = sbkwn) { - for (index = 0; ssl_bind_kws[index].kw != NULL; index++) { + for (index = 0; ssl_crtlist_kws[index].kw != NULL; index++) { if (strordered(sbkwp ? sbkwp->kw : NULL, - ssl_bind_kws[index].kw, + ssl_crtlist_kws[index].kw, sbkwn != sbkwp ? 
sbkwn->kw : NULL)) - sbkwn = &ssl_bind_kws[index]; + sbkwn = &ssl_crtlist_kws[index]; } if (sbkwn == sbkwp) break; diff --git a/src/ssl_crtlist.c b/src/ssl_crtlist.c index 22fe54228..ab6b262fe 100644 --- a/src/ssl_crtlist.c +++ b/src/ssl_crtlist.c @@ -419,17 +419,17 @@ int crtlist_parse_line(char *line, char **crt_path, struct crtlist_entry *entry, cur_arg = ssl_b ? ssl_b : 1; while (cur_arg < ssl_e) { newarg = 0; - for (i = 0; ssl_bind_kws[i].kw != NULL; i++) { - if (strcmp(ssl_bind_kws[i].kw, args[cur_arg]) == 0) { + for (i = 0; ssl_crtlist_kws[i].kw != NULL; i++) { + if (strcmp(ssl_crtlist_kws[i].kw, args[cur_arg]) == 0) { newarg = 1; - cfgerr |= ssl_bind_kws[i].parse(args, cur_arg, NULL, ssl_conf, from_cli, err); - if (cur_arg + 1 + ssl_bind_kws[i].skip > ssl_e) { + cfgerr |= ssl_crtlist_kws[i].parse(args, cur_arg, NULL, ssl_conf, from_cli, err); + if (cur_arg + 1 + ssl_crtlist_kws[i].skip > ssl_e) { memprintf(err, "parsing [%s:%d]: ssl args out of '[]' for %s", file, linenum, args[cur_arg]); cfgerr |= ERR_ALERT | ERR_FATAL; goto error; } - cur_arg += 1 + ssl_bind_kws[i].skip; + cur_arg += 1 + ssl_crtlist_kws[i].skip; break; } } From 8c5ff3e8d01d78bbb2341ab3a47a9eaddbf969e5 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Mon, 13 Feb 2023 15:24:01 +0100 Subject: [PATCH 066/140] BUG/MINOR: config: crt-list keywords mistaken for bind ssl keywords This patch fixes an issue in the "-dK" keywords dumper, which was mistakenly displaying the "crt-list" keywords for "bind ssl" keywords. The patch fixes the issue by dumping the "crt-list" keywords in its own section, and dumping the "bind" keywords which are in the "SSL" scope with a "bind ssl" prefix. This commit depends on the previous "MINOR: ssl: rename confusing ssl_bind_kws" commit. Must be backported in 2.6. 
Diff of the `./haproxy -dKall -q -c -f /dev/null` output before and after the patch in 2.8-dev4: | @@ -190,30 +190,9 @@ listen | use-fcgi-app | bind accept-netscaler-cip +1 | bind accept-proxy | - bind allow-0rtt | - bind alpn +1 | bind backlog +1 | - bind ca-file +1 | - bind ca-ignore-err +1 | - bind ca-sign-file +1 | - bind ca-sign-pass +1 | - bind ca-verify-file +1 | - bind ciphers +1 | - bind ciphersuites +1 | - bind crl-file +1 | - bind crt +1 | - bind crt-ignore-err +1 | - bind crt-list +1 | - bind curves +1 | bind defer-accept | - bind ecdhe +1 | bind expose-fd +1 | - bind force-sslv3 | - bind force-tlsv10 | - bind force-tlsv11 | - bind force-tlsv12 | - bind force-tlsv13 | - bind generate-certificates | bind gid +1 | bind group +1 | bind id +1 | @@ -225,48 +204,52 @@ listen | bind name +1 | bind namespace +1 | bind nice +1 | - bind no-ca-names | - bind no-sslv3 | - bind no-tls-tickets | - bind no-tlsv10 | - bind no-tlsv11 | - bind no-tlsv12 | - bind no-tlsv13 | - bind npn +1 | - bind prefer-client-ciphers | bind process +1 | bind proto +1 | bind severity-output +1 | bind shards +1 | - bind ssl | - bind ssl-max-ver +1 | - bind ssl-min-ver +1 | - bind strict-sni | bind tcp-ut +1 | bind tfo | bind thread +1 | - bind tls-ticket-keys +1 | bind transparent | bind uid +1 | bind user +1 | bind v4v6 | bind v6only | - bind verify +1 | bind ssl allow-0rtt | bind ssl alpn +1 | bind ssl ca-file +1 | + bind ssl ca-ignore-err +1 | + bind ssl ca-sign-file +1 | + bind ssl ca-sign-pass +1 | bind ssl ca-verify-file +1 | bind ssl ciphers +1 | bind ssl ciphersuites +1 | bind ssl crl-file +1 | + bind ssl crt +1 | + bind ssl crt-ignore-err +1 | + bind ssl crt-list +1 | bind ssl curves +1 | bind ssl ecdhe +1 | + bind ssl force-sslv3 | + bind ssl force-tlsv10 | + bind ssl force-tlsv11 | + bind ssl force-tlsv12 | + bind ssl force-tlsv13 | + bind ssl generate-certificates | bind ssl no-ca-names | + bind ssl no-sslv3 | + bind ssl no-tls-tickets | + bind ssl no-tlsv10 | + bind ssl 
no-tlsv11 | + bind ssl no-tlsv12 | + bind ssl no-tlsv13 | bind ssl npn +1 | - bind ssl ocsp-update +1 | + bind ssl prefer-client-ciphers | bind ssl ssl-max-ver +1 | bind ssl ssl-min-ver +1 | + bind ssl strict-sni | + bind ssl tls-ticket-keys +1 | bind ssl verify +1 | server addr +1 | server agent-addr +1 | @@ -591,6 +574,23 @@ listen | http-after-response unset-var* | userlist | peers | +crt-list | + allow-0rtt | + alpn +1 | + ca-file +1 | + ca-verify-file +1 | + ciphers +1 | + ciphersuites +1 | + crl-file +1 | + curves +1 | + ecdhe +1 | + no-ca-names | + npn +1 | + ocsp-update +1 | + ssl-max-ver +1 | + ssl-min-ver +1 | + verify +1 | # List of registered CLI keywords: | @! [MASTER] | @ [MASTER] (cherry picked from commit 44979ad680c1abcb33b2a2b2308bd3164f1f9465) Signed-off-by: William Lallemand (cherry picked from commit 8f7638af02414df9b58e8eea524c22793a3778d7) Signed-off-by: Willy Tarreau --- include/haproxy/cfgparse.h | 1 + src/cfgparse.c | 65 ++++++++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/include/haproxy/cfgparse.h b/include/haproxy/cfgparse.h index 7c3d128bf..e283af36b 100644 --- a/include/haproxy/cfgparse.h +++ b/include/haproxy/cfgparse.h @@ -35,6 +35,7 @@ struct acl_cond; #define CFG_LISTEN 2 #define CFG_USERLIST 3 #define CFG_PEERS 4 +#define CFG_CRTLIST 5 /* various keyword modifiers */ enum kw_mod { diff --git a/src/cfgparse.c b/src/cfgparse.c index fa9945b0c..c4b0056b5 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -4382,7 +4382,8 @@ void cfg_restore_sections(struct list *backup_sections) /* dumps all registered keywords by section on stdout */ void cfg_dump_registered_keywords() { - const char* sect_names[] = { "", "global", "listen", "userlist", "peers", 0 }; + /* CFG_GLOBAL, CFG_LISTEN, CFG_USERLIST, CFG_PEERS, CFG_CRTLIST */ + const char* sect_names[] = { "", "global", "listen", "userlist", "peers", "crt-list", 0 }; int section; int index; @@ -4409,22 +4410,24 @@ void 
cfg_dump_registered_keywords() extern struct list tcp_req_conn_keywords, tcp_req_sess_keywords, tcp_req_cont_keywords, tcp_res_cont_keywords; extern struct bind_kw_list bind_keywords; - extern struct ssl_crtlist_kw ssl_crtlist_kws[] __maybe_unused; extern struct srv_kw_list srv_keywords; struct bind_kw_list *bkwl; struct srv_kw_list *skwl; const struct bind_kw *bkwp, *bkwn; const struct srv_kw *skwp, *skwn; - const struct ssl_crtlist_kw *sbkwp __maybe_unused, *sbkwn __maybe_unused; const struct cfg_opt *coptp, *coptn; + /* display the non-ssl keywords */ for (bkwn = bkwp = NULL;; bkwp = bkwn) { list_for_each_entry(bkwl, &bind_keywords.list, list) { - for (index = 0; bkwl->kw[index].kw != NULL; index++) + if (strcmp(bkwl->scope, "SSL") == 0) /* skip SSL keywords */ + continue; + for (index = 0; bkwl->kw[index].kw != NULL; index++) { if (strordered(bkwp ? bkwp->kw : NULL, bkwl->kw[index].kw, bkwn != bkwp ? bkwn->kw : NULL)) bkwn = &bkwl->kw[index]; + } } if (bkwn == bkwp) break; @@ -4434,24 +4437,31 @@ void cfg_dump_registered_keywords() else printf("\tbind %s +%d\n", bkwn->kw, bkwn->skip); } - #if defined(USE_OPENSSL) - for (sbkwn = sbkwp = NULL;; sbkwp = sbkwn) { - for (index = 0; ssl_crtlist_kws[index].kw != NULL; index++) { - if (strordered(sbkwp ? sbkwp->kw : NULL, - ssl_crtlist_kws[index].kw, - sbkwn != sbkwp ? sbkwn->kw : NULL)) - sbkwn = &ssl_crtlist_kws[index]; + /* displays the "ssl" keywords */ + for (bkwn = bkwp = NULL;; bkwp = bkwn) { + list_for_each_entry(bkwl, &bind_keywords.list, list) { + if (strcmp(bkwl->scope, "SSL") != 0) /* skip non-SSL keywords */ + continue; + for (index = 0; bkwl->kw[index].kw != NULL; index++) { + if (strordered(bkwp ? bkwp->kw : NULL, + bkwl->kw[index].kw, + bkwn != bkwp ? 
bkwn->kw : NULL)) + bkwn = &bkwl->kw[index]; + } } - if (sbkwn == sbkwp) + if (bkwn == bkwp) break; - if (!sbkwn->skip) - printf("\tbind ssl %s\n", sbkwn->kw); + + if (strcmp(bkwn->kw, "ssl") == 0) /* skip "bind ssl ssl" */ + continue; + + if (!bkwn->skip) + printf("\tbind ssl %s\n", bkwn->kw); else - printf("\tbind ssl %s +%d\n", sbkwn->kw, sbkwn->skip); + printf("\tbind ssl %s +%d\n", bkwn->kw, bkwn->skip); } #endif - for (skwn = skwp = NULL;; skwp = skwn) { list_for_each_entry(skwl, &srv_keywords.list, list) { for (index = 0; skwl->kw[index].kw != NULL; index++) @@ -4502,6 +4512,29 @@ void cfg_dump_registered_keywords() dump_act_rules(&http_res_keywords.list, "\thttp-response "); dump_act_rules(&http_after_res_keywords.list, "\thttp-after-response "); } + if (section == CFG_CRTLIST) { + /* displays the keyword available for the crt-lists */ + extern struct ssl_crtlist_kw ssl_crtlist_kws[] __maybe_unused; + const struct ssl_crtlist_kw *sbkwp __maybe_unused, *sbkwn __maybe_unused; + +#if defined(USE_OPENSSL) + for (sbkwn = sbkwp = NULL;; sbkwp = sbkwn) { + for (index = 0; ssl_crtlist_kws[index].kw != NULL; index++) { + if (strordered(sbkwp ? sbkwp->kw : NULL, + ssl_crtlist_kws[index].kw, + sbkwn != sbkwp ? sbkwn->kw : NULL)) + sbkwn = &ssl_crtlist_kws[index]; + } + if (sbkwn == sbkwp) + break; + if (!sbkwn->skip) + printf("\t%s\n", sbkwn->kw); + else + printf("\t%s +%d\n", sbkwn->kw, sbkwn->skip); + } +#endif + + } } } From 18521f34fd340db7eb2a71b350e5fe1dba8674b9 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 9 Mar 2023 10:12:06 +0100 Subject: [PATCH 067/140] BUG/MINOR: init: properly detect NUMA bindings on large systems The NUMA detection code tries not to interfer with any taskset the user could have specified in init scripts. For this it compares the number of CPUs available with the number the process is bound to. 
However, the CPU count is retrieved after being applied an upper bound of MAX_THREADS, so if the machine has more than 64 CPUs, the comparison always fails and makes haproxy think the user has already enforced a binding, and it does not pin it anymore to a single NUMA node. This can be verified by issuing: $ socat /path/to/sock - <<< "show info" | grep thread On a dual 48-CPU machine it reports 64, implying that threads are allowed to run on the second socket: Nbthread: 64 With this fix, the function properly reports 96, and the output shows 48, indicating that a single NUMA node was used: Nbthread: 48 Of course nothing is changed when "no numa-cpu-mapping" is specified: Nbthread: 64 This can be backported to 2.4. (cherry picked from commit f5b63277f416376b54276d4f0f9ea7999525180e) Signed-off-by: Willy Tarreau (cherry picked from commit b5ed7e86ca5bbceb6440f5dba8e88513f90f8c0c) Signed-off-by: Willy Tarreau --- src/thread.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/thread.c b/src/thread.c index eb8e798c8..c5dd12324 100644 --- a/src/thread.c +++ b/src/thread.c @@ -345,7 +345,9 @@ void ha_rwlock_init(HA_RWLOCK_T *l) HA_RWLOCK_INIT(l); } -/* returns the number of CPUs the current process is enabled to run on */ +/* returns the number of CPUs the current process is enabled to run on, + * regardless of any MAX_THREADS limitation. 
+ */ static int thread_cpus_enabled() { int ret = 1; @@ -366,7 +368,6 @@ static int thread_cpus_enabled() #endif #endif ret = MAX(ret, 1); - ret = MIN(ret, MAX_THREADS); return ret; } @@ -964,6 +965,7 @@ static void __thread_init(void) preload_libgcc_s(); thread_cpus_enabled_at_boot = thread_cpus_enabled(); + thread_cpus_enabled_at_boot = MIN(thread_cpus_enabled_at_boot, MAX_THREADS); memprintf(&ptr, "Built with multi-threading support (MAX_THREADS=%d, default=%d).", MAX_THREADS, thread_cpus_enabled_at_boot); From 4e6c02fda6321174381e51ea7117e7ba6e38f5f6 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 22 Jul 2022 17:35:49 +0200 Subject: [PATCH 068/140] BUG/MEDIUM: master: force the thread count earlier Christopher bisected that recent commit d0b73bca71 ("MEDIUM: listener: switch bind_thread from global to group-local") broke the master socket in that only the first out of the Nth initial connections would work, where N is the number of threads, after which they all work. The cause is that the master socket was bound to multiple threads, despite global.nbthread being 1 there, so the incoming connection load balancing would try to send incoming connections to non-existing threads, however the bind_thread mask would nonetheless include multiple threads. What happened is that in 1.9 we forced "nbthread" to 1 in the master's poll loop with commit b3f2be338b ("MEDIUM: mworker: use the haproxy poll loop"). In 2.0, nbthread detection was enabled by default in commit 149ab779cc ("MAJOR: threads: enable one thread per CPU by default"). From this point on, the operation above is unsafe because everything during startup is performed with nbthread corresponding to the default value, then it changes to one when starting the polling loop. But by then we weren't using the wait mode except for reload errors, so even if it would have happened nobody would have noticed. 
In 2.5 with commit fab0fdce9 ("MEDIUM: mworker: reexec in waitpid mode after successful loading") we started to rexecute all the time, not just for errors, so as to release precious resources and to possibly spot bugs that were rarely exposed in this mode. By then the incoming connection LB was enforcing all_threads_mask on the listener's thread mask so that the incorrect value was being corrected while using it. Finally in 2.7 commit d0b73bca71 ("MEDIUM: listener: switch bind_thread from global to group-local") replaces the all_threads_mask there with the listener's bind_thread, but that one was never adjusted by the starting master, whose thread group was filled to N threads by the automatic detection during early setup. The best approach here is to set nbthread to 1 very early in init() when we're in the master in wait mode, so that we don't try to guess the best value and don't end up with incorrect bindings anymore. This patch does this and also sets nbtgroups to 1 in preparation for a possible future where this will also be automatically calculated. There is no need to backport this patch since no other versions were affected, but if it were to be discovered that the incorrect bind mask on some of the master's FDs could be responsible for any trouble in older versions, then the backport should be safe (provided that nbtgroups is dropped of course). (cherry picked from commit 53bfac8c632dd77a199194f70062bfd86270d400) [wt: this fixes "no numa-cpu-mapping" which is lost for the master past the wait mode, hence ignored for workers. The option was added in 2.4, not sure whether we want to backport that far. 
For 2.6, we set tid_bit and all_threads_mask] Signed-off-by: Willy Tarreau --- src/haproxy.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/haproxy.c b/src/haproxy.c index 0cb0662d2..4fcc74963 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -837,13 +837,6 @@ static void mworker_loop() mworker_catch_sigchld(NULL); /* ensure we clean the children in case some SIGCHLD were lost */ - global.nbthread = 1; - -#ifdef USE_THREAD - tid_bit = 1; - all_threads_mask = 1; -#endif - jobs++; /* this is the "master" job, we want to take care of the signals even if there is no listener so the poll loop don't leave */ @@ -2115,6 +2108,19 @@ static void init(int argc, char **argv) LIST_APPEND(&proc_list, &tmproc->list); } + + if (global.mode & MODE_MWORKER_WAIT) { + /* in exec mode, there's always exactly one thread. Failure to + * set these ones now will result in nbthread being detected + * automatically. + */ + global.nbthread = 1; +#ifdef USE_THREAD + tid_bit = 1; + all_threads_mask = 1; +#endif + } + if (global.mode & (MODE_MWORKER|MODE_MWORKER_WAIT)) { struct wordlist *it, *c; From e9485a5b8ebe63619db84d56e7681f371772fd47 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 9 Mar 2023 11:47:55 +0100 Subject: [PATCH 069/140] BUG/MINOR: init: make sure to always limit the total number of threads Commit 18521f34fd ("BUG/MINOR: init: properly detect NUMA bindings on large systems") revealed another interesting issue which is that we can now fail to start if we have more than 64 threads on a single NUMA node. 2.7 and newer have an explicit test for this and will issue a diag message indicating this limitation, but 2.6 did not. Let's integrate the same mechanism so that we always start but print the hard limit. This will have to be backported to the same versions where the patch above is backported. 
--- src/cfgparse.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cfgparse.c b/src/cfgparse.c index c4b0056b5..89c715d1d 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -2499,6 +2499,12 @@ int check_config_validity() #endif global.nbthread = numa_cores ? numa_cores : thread_cpus_enabled_at_boot; + + if (global.nbthread > MAX_THREADS) { + ha_diag_warning("nbthread not set, found %d CPUs, limiting to %d threads. Please set nbthreads in the global section to silence this warning.\n", + global.nbthread, MAX_THREADS); + global.nbthread = MAX_THREADS; + } } all_threads_mask = nbits(global.nbthread); #endif From 11643c4775cceeaeb78e72f93128ad3612a778d9 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 9 Mar 2023 11:39:51 +0100 Subject: [PATCH 070/140] BUG/MINOR: thread: report thread and group counts in the correct order In case too many thread groups are needed for the threads, we emit an error indicating the problem. Unfortunately the threads and groups counts were reversed. This can be backported to 2.6. (cherry picked from commit cf0d0eedc74736d83ff38856eabdd6121c2ee253) Signed-off-by: Willy Tarreau (cherry picked from commit ddbeac5ca7eba07fe9e683ec9b5d97e62bafb8df) Signed-off-by: Willy Tarreau --- src/thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/thread.c b/src/thread.c index c5dd12324..369bf8a77 100644 --- a/src/thread.c +++ b/src/thread.c @@ -1042,7 +1042,7 @@ int thread_map_to_groups() q = ut / ug; r = ut % ug; if ((q + !!r) > MAX_THREADS_PER_GROUP) { - ha_alert("Too many remaining unassigned threads (%d) for thread groups (%d). Please increase thread-groups or make sure to keep thread numbers contiguous\n", ug, ut); + ha_alert("Too many remaining unassigned threads (%d) for thread groups (%d). 
Please increase thread-groups or make sure to keep thread numbers contiguous\n", ut, ug); return -1; } From be18ccc18ae71c1ef54c1af67886bf46f6da8ab7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 26 Jan 2023 15:34:31 +0100 Subject: [PATCH 071/140] BUG/MINOR: ring: release the backing store name on exit ASAN found that a ring equipped with a backing store did not release the store name on exit. This should be backported to 2.7. (cherry picked from commit b91910955a449fbd1feec405ad9da921e00ee9a5) Signed-off-by: Willy Tarreau (cherry picked from commit a937f05e84e247cf91a188856d1736275468dab4) [wt: ring backing-store was backported to 2.6 and needs this one] Signed-off-by: Willy Tarreau --- src/sink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sink.c b/src/sink.c index 8f95ef984..6dbda87aa 100644 --- a/src/sink.c +++ b/src/sink.c @@ -1399,6 +1399,7 @@ static void sink_deinit() msync(area, size, MS_SYNC); munmap(area, size); + ha_free(&sink->store); } else ring_free(sink->ctx.ring); From 8f2294b16084c2deb96f7c6bd36d5a41be3f7294 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Jul 2022 18:36:49 +0200 Subject: [PATCH 072/140] MEDIUM: epoll: don't synchronously delete migrated FDs Between 1.8 and 1.9 commit d9e7e36c6 ("BUG/MEDIUM: epoll/threads: use one epoll_fd per thread") split the epoll poller to use one poller per thread (and this was backported to 1.8). This patch added a call to epoll_ctl(DEL) on return from the I/O handler as a safe way to deal with a detected thread migration when that code was still quite fragile. One aspect of this choice was that by then we wanted to maintain support for the rare old bogus epoll implementations that failed to remove events on close(), so risking to lose the event was not an option. 
Later in 2.5, commit 200bd50b7 ("MEDIUM: fd: rely more on fd_update_events() to detect changes") changed the code to perform most of the operations inside fd_update_events(), but it maintained that oddity, to the point that strictly all pollers except epoll now just add an update to be dealt with at the next round. This approach is much more efficient, because under load and server-side connection reuse, it's perfectly possible for a thread to see the same FD several times in a poll loop, the first time to relinquish it after a migration, then the other thread makes a request, gets its response, and still during the same loop for the first one, grabbing an idle connection to send a request and wait for a response will program a new update on this FD. By using a synchronous epoll_ctl(DEL), we effectively lose the opportunity to aggregate certain changes in the same update. Some tests performed locally with 8 threads and one server show that on average, by using an update instead of a synchronous call, we reduce the number of epoll_ctl() calls by 25-30% (under low loads it will probably not change anything). So this patch implements the same method for all pollers and replaces the synchronous epoll_ctl() with an update. 
(cherry picked from commit 0d023774bff9cfc49ab5665f73c6258f0f60f9b3) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/ev_epoll.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ev_epoll.c b/src/ev_epoll.c index c5d267f4b..a74929607 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -218,7 +218,6 @@ static void _do_poll(struct poller *p, int exp, int wake) /* process polled events */ for (count = 0; count < status; count++) { - struct epoll_event ev; unsigned int n, e; int ret; @@ -241,9 +240,8 @@ static void _do_poll(struct poller *p, int exp, int wake) if (ret == FD_UPDT_MIGRATED) { /* FD has been migrated */ - epoll_ctl(epoll_fd[tid], EPOLL_CTL_DEL, fd, &ev); - _HA_ATOMIC_AND(&polled_mask[fd].poll_recv, ~tid_bit); - _HA_ATOMIC_AND(&polled_mask[fd].poll_send, ~tid_bit); + if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) + fd_updt[fd_nbupdt++] = fd; } } /* the caller will take care of cached events */ From 24795953d416175d02a81045806263c0bb38a71c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Jul 2022 18:55:37 +0200 Subject: [PATCH 073/140] MEDIUM: poller: program the update in fd_update_events() for a migrated FD When an FD is migrated, all pollers program an update. That's useless code duplication, and when thread groups will be supported, this will require an extra round of locking just to verify the update_mask on return. Let's just program the update direction from fd_update_events() as it already does for closed FDs, this becomes more logical. 
(cherry picked from commit b1093c6ba21ae3d41946f3a7bb4886b8b90e7c27) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/ev_epoll.c | 9 +-------- src/ev_evports.c | 7 ++----- src/ev_kqueue.c | 9 +-------- src/ev_poll.c | 10 ++-------- src/fd.c | 4 ++++ 5 files changed, 10 insertions(+), 29 deletions(-) diff --git a/src/ev_epoll.c b/src/ev_epoll.c index a74929607..72d8dcad0 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -219,7 +219,6 @@ static void _do_poll(struct poller *p, int exp, int wake) for (count = 0; count < status; count++) { unsigned int n, e; - int ret; e = epoll_events[count].events; fd = epoll_events[count].data.fd; @@ -236,13 +235,7 @@ static void _do_poll(struct poller *p, int exp, int wake) ((e & EPOLLHUP) ? FD_EV_SHUT_RW : 0) | ((e & EPOLLERR) ? FD_EV_ERR_RW : 0); - ret = fd_update_events(fd, n); - - if (ret == FD_UPDT_MIGRATED) { - /* FD has been migrated */ - if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) - fd_updt[fd_nbupdt++] = fd; - } + fd_update_events(fd, n); } /* the caller will take care of cached events */ } diff --git a/src/ev_evports.c b/src/ev_evports.c index de5ebce56..7ef26277c 100644 --- a/src/ev_evports.c +++ b/src/ev_evports.c @@ -246,12 +246,9 @@ static void _do_poll(struct poller *p, int exp, int wake) */ ret = fd_update_events(fd, n); - /* disable polling on this instance if the FD was migrated */ - if (ret == FD_UPDT_MIGRATED) { - if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) - fd_updt[fd_nbupdt++] = fd; + /* polling will be on this instance if the FD was migrated */ + if (ret == FD_UPDT_MIGRATED) continue; - } /* * This file descriptor was closed during the processing of diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c index b6172ca96..991e39b8a 100644 --- a/src/ev_kqueue.c +++ b/src/ev_kqueue.c @@ -182,7 +182,6 @@ static void _do_poll(struct poller *p, int exp, int wake) for (count = 0; count < status; count++) { unsigned int n = 0; - int ret; fd = kev[count].ident; @@ 
-201,13 +200,7 @@ static void _do_poll(struct poller *p, int exp, int wake) n |= FD_EV_ERR_RW; } - ret = fd_update_events(fd, n); - - if (ret == FD_UPDT_MIGRATED) { - /* FD was migrated, let's stop polling it */ - if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) - fd_updt[fd_nbupdt++] = fd; - } + fd_update_events(fd, n); } } diff --git a/src/ev_poll.c b/src/ev_poll.c index 730f94e62..3882a4e38 100644 --- a/src/ev_poll.c +++ b/src/ev_poll.c @@ -219,8 +219,8 @@ static void _do_poll(struct poller *p, int exp, int wake) for (count = 0; status > 0 && count < nbfd; count++) { unsigned int n; - int ret; int e = poll_events[count].revents; + fd = poll_events[count].fd; if ((e & POLLRDHUP) && !(cur_poller.flags & HAP_POLL_F_RDHUP)) @@ -241,13 +241,7 @@ static void _do_poll(struct poller *p, int exp, int wake) ((e & POLLHUP) ? FD_EV_SHUT_RW : 0) | ((e & POLLERR) ? FD_EV_ERR_RW : 0); - ret = fd_update_events(fd, n); - - if (ret == FD_UPDT_MIGRATED) { - /* FD was migrated, let's stop polling it */ - if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) - fd_updt[fd_nbupdt++] = fd; - } + fd_update_events(fd, n); } } diff --git a/src/fd.c b/src/fd.c index 0e059c1c9..951bc0382 100644 --- a/src/fd.c +++ b/src/fd.c @@ -489,6 +489,10 @@ int fd_update_events(int fd, uint evts) if (!(tmask & tid_bit)) { /* a takeover has started */ activity[tid].poll_skip_fd++; + + /* Let the poller know this FD was lost */ + if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) + fd_updt[fd_nbupdt++] = fd; return FD_UPDT_MIGRATED; } } while (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &rmask, rmask | tid_bit)); From 3693af3781d2025ac67d72c209fc6ee4b9a9fe63 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Jul 2022 16:23:41 +0200 Subject: [PATCH 074/140] MAJOR: fd: remove pending updates upon real close Dealing with long-lasting updates that outlive a close() is always going to be quite a problem, not because of the thread that will discover such updates late, but mostly due to the shared update_list that 
will have an entry on hold making it difficult to reuse it, and requiring that the fd's tgid is changed and the update_mask reset from a safe location. After careful inspection, it turns out that all our pollers that support automatic event removal upon close() do not need any extra bookkeeping, and that poll and select that use an internal representation already provide a poller->clo() callback that is already used to update the local event. As such, it is already safe to reset the update mask and to remove the event from the shared list just before the final close, because nothing remains to be done with this FD by the poller. Doing so considerably simplifies the handling of updates, which will only have to be inspected by the pollers, while the writers can continue to consider that the entries are always valid. Another benefit is that it will be possible to reduce contention on the update_list by just having one update_list per group (left to be done later if needed). (cherry picked from commit 2f36d902aa4dc6c1d0a53db306e0138db3d607ca) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/fd.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/fd.c b/src/fd.c index 951bc0382..09231eecd 100644 --- a/src/fd.c +++ b/src/fd.c @@ -314,9 +314,20 @@ void _fd_delete_orphan(int fd) DISGUISE(setsockopt(fd, SOL_SOCKET, SO_LINGER, (struct linger *) &nolinger, sizeof(struct linger))); } + + /* It's expected that a close() will result in the FD disappearing from + * pollers, but some pollers may have some internal bookkeeping to be + * done prior to the call (e.g. remove references from internal tables). 
+ */ if (cur_poller.clo) cur_poller.clo(fd); + /* we don't want this FD anymore in the global list */ + fd_rm_from_fd_list(&update_list, fd, offsetof(struct fdtab, update)); + + /* no more updates on this FD are relevant anymore */ + HA_ATOMIC_STORE(&fdtab[fd].update_mask, 0); + port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; @@ -327,6 +338,7 @@ void _fd_delete_orphan(int fd) #endif fdinfo[fd].port_range = NULL; fdtab[fd].owner = NULL; + /* perform the close() call last as it's what unlocks the instant reuse * of this FD by any other thread. */ From 213a8245dd2aaef7e2c5ca6f976cb533ee48d152 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Jul 2022 16:20:11 +0200 Subject: [PATCH 075/140] MINOR: fd: delete unused updates on close() After a poller's ->clo() was called to completely terminate operations on an FD, there's no reason for keeping updates on this FD, so if any updates were already programmed it would be nice if we could delete them. Tests show that __fd_clo() is called roughly half of the time with the last FD from the local update list, which possibly makes sense if a close has to appear after a polling change resulting from an incomplete read or the end of a send(). We can detect this and remove the last entry, which gives less work to do during the update() call, and eliminates most of the poll_drop_fd event reports. Note that while tempting, this must not be backported because it's only safe to be done now that fd_delete_orphan() clears the update mask as we need to be certain not to miss it: - if the update mask is kept up with no entry, we can miss future updates ; - if the update mask is cleared too fast, it may result in failure to add a shared event. 
(cherry picked from commit 8e2c0fa8e554d189ecb98092bd984790cb05736b) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/fd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fd.c b/src/fd.c index 09231eecd..edce10722 100644 --- a/src/fd.c +++ b/src/fd.c @@ -327,6 +327,8 @@ void _fd_delete_orphan(int fd) /* no more updates on this FD are relevant anymore */ HA_ATOMIC_STORE(&fdtab[fd].update_mask, 0); + if (fd_nbupdt > 0 && fd_updt[fd_nbupdt - 1] == fd) + fd_nbupdt--; port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port); polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; From dc7dfefcfe68632384c6f9b932f6cbaff3f7c6a9 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 5 Jul 2022 05:16:13 +0200 Subject: [PATCH 076/140] MEDIUM: fd: add the tgid to the fd and pass it to fd_insert() The file descriptors will need to know the thread group ID in addition to the mask. This extends fd_insert() to take the tgid, and will store it into the FD. In the FD, the tgid is stored as a combination of tgid on the lower 16 bits and a refcount on the higher 16 bits. This allows to know when it's really possible to trust the tgid and the running mask. If a refcount is higher than 1 it indeed indicates another thread else might be in the process of updating these values. Since a closed FD must necessarily have a zero refcount, a test was added to fd_insert() to make sure that it is the case. (cherry picked from commit 9464bb1f05b5e0046716b4573a567d3450ac7604) [wt: this is necessary to fix a design bug introduced in 2.5. 
The patch was simplified as 2.6 does not yet support thread groups, only group 1 is added and the fd_insert() API does not change] Signed-off-by: Willy Tarreau --- include/haproxy/fd-t.h | 7 +++++++ include/haproxy/fd.h | 8 ++++++++ src/fd.c | 1 + 3 files changed, 16 insertions(+) diff --git a/include/haproxy/fd-t.h b/include/haproxy/fd-t.h index 64416d6dc..32c173ae0 100644 --- a/include/haproxy/fd-t.h +++ b/include/haproxy/fd-t.h @@ -153,6 +153,12 @@ struct fdlist { /* info about one given fd. Note: only align on cache lines when using threads; * 32-bit small archs can put everything in 32-bytes when threads are disabled. + * refc_tgid is an atomic 32-bit composite value made of 16 higher bits + * containing a refcount on tgid and the running_mask, and 16 lower bits + * containing a thread group ID. The tgid may only be changed when refc is zero + * and running may only be checked/changed when refc is held and shows the + * reader is alone. An FD with tgid zero belongs to nobody. For now only tgid 1 + * is supported. 
*/ struct fdtab { unsigned long running_mask; /* mask of thread IDs currently using the fd */ @@ -162,6 +168,7 @@ struct fdtab { void (*iocb)(int fd); /* I/O handler */ void *owner; /* the connection or listener associated with this fd, NULL if closed */ unsigned int state; /* FD state for read and write directions (FD_EV_*) + FD_POLL_* */ + unsigned int refc_tgid; /* refcounted tgid, updated atomically */ #ifdef DEBUG_FD unsigned int event_count; /* number of events reported */ #endif diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index c67e5c61b..7c7e0823d 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -317,6 +317,12 @@ static inline void fd_want_send(int fd) updt_fd_polling(fd); } +/* returns the tgid from an fd (masks the refcount) */ +static forceinline int fd_tgid(int fd) +{ + return _HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) & 0xFFFF; +} + /* remove tid_bit from the fd's running mask and returns the bits that remain * after the atomic operation. */ @@ -337,10 +343,12 @@ static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), unsigned BUG_ON(fd >= global.maxsock); BUG_ON(fdtab[fd].owner != NULL); BUG_ON(fdtab[fd].state != 0); + BUG_ON(fdtab[fd].refc_tgid != 0); fdtab[fd].owner = owner; fdtab[fd].iocb = iocb; fdtab[fd].state = 0; + fdtab[fd].refc_tgid = 1; #ifdef DEBUG_FD fdtab[fd].event_count = 0; #endif diff --git a/src/fd.c b/src/fd.c index edce10722..09b45b614 100644 --- a/src/fd.c +++ b/src/fd.c @@ -334,6 +334,7 @@ void _fd_delete_orphan(int fd) polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; fdtab[fd].state = 0; + fdtab[fd].refc_tgid = 0; #ifdef DEBUG_FD fdtab[fd].event_count = 0; From f3c8832ffae0758bcb66103e3c93143c0c86db45 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 8 Jul 2022 10:23:01 +0200 Subject: [PATCH 077/140] MINOR: cli/fd: show fd's tgid and refcount in "show fd" We really need to display these values now. 
[wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/cli.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cli.c b/src/cli.c index 54d0b2712..236311ad0 100644 --- a/src/cli.c +++ b/src/cli.c @@ -1333,7 +1333,7 @@ static int cli_io_handler_show_fd(struct appctx *appctx) suspicious = 1; chunk_printf(&trash, - " %5d : st=0x%06x(%c%c %c%c%c%c%c W:%c%c%c R:%c%c%c) tmask=0x%lx umask=0x%lx prmsk=0x%lx pwmsk=0x%lx owner=%p iocb=%p(", + " %5d : st=0x%06x(%c%c %c%c%c%c%c W:%c%c%c R:%c%c%c) ref=%#x gid=%d tmask=0x%lx umask=0x%lx prmsk=0x%lx pwmsk=0x%lx owner=%p iocb=%p(", fd, fdt.state, (fdt.state & FD_CLONED) ? 'C' : 'c', @@ -1349,6 +1349,8 @@ static int cli_io_handler_show_fd(struct appctx *appctx) (fdt.state & FD_EV_SHUT_R) ? 'S' : 's', (fdt.state & FD_EV_READY_R) ? 'R' : 'r', (fdt.state & FD_EV_ACTIVE_R) ? 'A' : 'a', + (fdt.refc_tgid >> 4) & 0xffff, + (fdt.refc_tgid) & 0xffff, fdt.thread_mask, fdt.update_mask, polled_mask[fd].poll_recv, polled_mask[fd].poll_send, From 1d61c21742f25c0ca6a5eb730907dd895fd5fdba Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Jul 2022 18:27:13 +0200 Subject: [PATCH 078/140] MINOR: fd: add functions to manipulate the FD's tgid The FD's tgid is refcounted and must be atomically manipulated. Function fd_grab_tgid() will increase the refcount but only if the tgid matches the one in argument (likely the current one). fd_claim_tgid() will be used to self-assign the tgid after waiting for its refcount to reach zero. fd_drop_tgid() will be used to drop a temporarily held tgid. All of these are needed to prevent an FD from being reassigned to another group, either when inspecting/modifying the running_mask, or when checking for updates, in order to be certain that the mask being seen corresponds to the desired group. 
Note that once at least one bit is set in the running mask of an active FD, it cannot be closed, thus not migrated, thus the reference does not need to be held long. (cherry picked from commit 080373ea3896781963f647fd1dcea30ab46fa50f) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- include/haproxy/fd.h | 67 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index 7c7e0823d..4fb9062a7 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -323,6 +323,73 @@ static forceinline int fd_tgid(int fd) return _HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) & 0xFFFF; } +/* Release a tgid previously taken by fd_grab_tgid() */ +static forceinline void fd_drop_tgid(int fd) +{ + HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000); +} + +/* Grab a reference to the FD's TGID, and return the tgid. Note that a TGID of + * zero indicates the FD was closed, thus also fails (i.e. no need to drop it). + * On non-zero (success), the caller must release it using fd_drop_tgid(). + */ +static inline uint fd_take_tgid(int fd) +{ + uint old; + + old = _HA_ATOMIC_FETCH_ADD(&fdtab[fd].refc_tgid, 0x10000) & 0xffff; + if (likely(old)) + return old; + HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000); + return 0; +} + +/* Reset a tgid without affecting the refcount */ +static forceinline void fd_reset_tgid(int fd) +{ + HA_ATOMIC_AND(&fdtab[fd].refc_tgid, 0xffff0000U); +} + +/* Try to grab a reference to the FD's TGID, but only if it matches the + * requested one (i.e. it succeeds with TGID refcnt held, or fails). Note that + * a TGID of zero indicates the FD was closed, thus also fails. It returns + * non-zero on success, in which case the caller must then release it using + * fd_drop_tgid(), or zero on failure. The function is optimized for use + * when it's likely that the tgid matches the desired one as it's by far + * the most common. 
+ */ +static inline uint fd_grab_tgid(int fd, uint desired_tgid) +{ + uint old; + + old = _HA_ATOMIC_FETCH_ADD(&fdtab[fd].refc_tgid, 0x10000) & 0xffff; + if (likely(old == desired_tgid)) + return 1; + HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000); + return 0; +} + +/* Set the FD's TGID to the new value with a refcount of 1, waiting for the + * current refcount to become 0, to cover the rare possibly that a late + * competing thread would be touching the tgid or the running mask in parallel. + * The caller must call fd_drop_tgid() once done. + */ +static inline void fd_claim_tgid(int fd, uint desired_tgid) +{ + uint old; + + BUG_ON(!desired_tgid); + + desired_tgid += 0x10000; // refcount=1 + old = desired_tgid; + while (1) { + old &= 0xffff; + if (_HA_ATOMIC_CAS(&fdtab[fd].refc_tgid, &old, desired_tgid)) + break; + __ha_cpu_relax(); + } +} + /* remove tid_bit from the fd's running mask and returns the bits that remain * after the atomic operation. */ From ccd8e3d5cacb61d9d1bde7f4adec4793e9aa2079 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Jul 2022 14:09:35 +0200 Subject: [PATCH 079/140] MINOR: fd: add fd_get_running() to atomically return the running mask The running mask is only valid if the tgid is the expected one. This function takes a reference on the tgid before reading the running mask, so that both are checked at once. It returns either the mask or zero if the tgid differs, thus providing a simple way for a caller to check if it still holds the FD. 
(cherry picked from commit ceffd17f52b6b1aa481365fc6f9b88e8efc436e8) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- include/haproxy/fd.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index 4fb9062a7..555ad66bf 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -390,6 +390,27 @@ static inline void fd_claim_tgid(int fd, uint desired_tgid) } } +/* atomically read the running mask if the tgid matches, or returns zero if it + * does not match. This is meant for use in code paths where the bit is expected + * to be present and will be sufficient to protect against a short-term group + * migration (e.g. takss and return from iocb). + */ +static inline ulong fd_get_running(int fd, uint desired_tgid) +{ + ulong ret = 0; + uint old; + + /* TODO: may also be checked using an atomic double-load from a DWCAS + * on compatible architectures, which wouldn't require to modify nor + * restore the original value. + */ + old = _HA_ATOMIC_ADD_FETCH(&fdtab[fd].refc_tgid, 0x10000); + if (likely((old & 0xffff) == desired_tgid)) + ret = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask); + _HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000); + return ret; +} + /* remove tid_bit from the fd's running mask and returns the bits that remain * after the atomic operation. */ From b926e860183e64e575706ef9b1e23a2322721b23 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 6 Jul 2022 18:47:38 +0200 Subject: [PATCH 080/140] MAJOR: fd: grab the tgid before manipulating running We now grab a reference to the FD's tgid before manipulating the running_mask so that we're certain it corresponds to our own group (hence bits), and we drop it once we've set the bit. For now there's no measurable performance impact in doing this, which is great. 
The lock can be observed by perf top as taking a small share of the time spent in fd_update_events(), itself taking no more than 0.28% of CPU under 8 threads. However due to the fact that the thread groups are not yet properly spread across the pollers and the thread masks are still wrong, this will trigger some BUG_ON() in fd_insert() after a few tens of thousands of connections when threads other than those of group 1 are reached, and this is expected. (cherry picked from commit 0dc1cc93b669df7674ec9036f217db1bceb68b01) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- src/fd.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/fd.c b/src/fd.c index 09b45b614..79484efe6 100644 --- a/src/fd.c +++ b/src/fd.c @@ -360,6 +360,11 @@ void fd_delete(int fd) */ BUG_ON(fd < 0 || fd >= global.maxsock); + /* the tgid cannot change before a complete close so we should never + * face the situation where we try to close an fd that was reassigned. + */ + BUG_ON(fd_tgid(fd) != 1 && !thread_isolated()); + /* we must postpone removal of an FD that may currently be in use * by another thread. This can happen in the following two situations: * - after a takeover, the owning thread closes the connection but @@ -427,12 +432,18 @@ int fd_takeover(int fd, void *expected_owner) /* we must be alone to work on this idle FD. If not, it means that its * poller is currently waking up and is about to use it, likely to * close it on shut/error, but maybe also to process any unexpectedly - * pending data. + * pending data. It's also possible that the FD was closed and + * reassigned to another thread group, so let's be careful. 
*/ - old = 0; - if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, tid_bit)) + if (unlikely(!fd_grab_tgid(fd, 1))) return -1; + old = 0; + if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, tid_bit)) { + fd_drop_tgid(fd); + return -1; + } + /* success, from now on it's ours */ HA_ATOMIC_STORE(&fdtab[fd].thread_mask, tid_bit); @@ -444,6 +455,9 @@ int fd_takeover(int fd, void *expected_owner) /* we're done with it */ HA_ATOMIC_AND(&fdtab[fd].running_mask, ~tid_bit); + + /* no more changes planned */ + fd_drop_tgid(fd); return 0; } @@ -490,6 +504,17 @@ int fd_update_events(int fd, uint evts) th_ctx->flags &= ~TH_FL_STUCK; // this thread is still running + if (unlikely(!fd_grab_tgid(fd, 1))) { + /* the FD changed to another tgid, we can't safely + * check it anymore. The bits in the masks are not + * ours anymore and we're not allowed to touch them. + * Ours have already been cleared and the FD was + * closed in between so we can safely leave now. + */ + activity[tid].poll_drop_fd++; + return FD_UPDT_CLOSED; + } + /* do nothing if the FD was taken over under us */ do { /* make sure we read a synchronous copy of rmask and tmask @@ -508,10 +533,15 @@ int fd_update_events(int fd, uint evts) /* Let the poller know this FD was lost */ if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) fd_updt[fd_nbupdt++] = fd; + + fd_drop_tgid(fd); return FD_UPDT_MIGRATED; } } while (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &rmask, rmask | tid_bit)); + /* with running we're safe now, we can drop the reference */ + fd_drop_tgid(fd); + locked = (tmask != tid_bit); /* OK now we are guaranteed that our thread_mask was present and From cd883aafb2dc3897b233f574440d02aebe08223b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Jul 2022 15:57:17 +0200 Subject: [PATCH 081/140] MINOR: fd: make fd_clr_running() return the previous value instead It's an AND so it destroys information and due to this there's a call place where we have to perform two reads to know the previous value then to change 
it. With a fetch-and-and instead, in a single operation we can know if the bit was previously present, which is more efficient. (cherry picked from commit d6e198761281e755e5b96272ed2a751df096efdf) [wt: this is necessary to fix a design bug introduced in 2.5; ctx adjustment for tid_bit instead of ti->ltid_bit] Signed-off-by: Willy Tarreau --- include/haproxy/fd.h | 6 +++--- src/fd.c | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index 555ad66bf..39a10f217 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -411,12 +411,12 @@ static inline ulong fd_get_running(int fd, uint desired_tgid) return ret; } -/* remove tid_bit from the fd's running mask and returns the bits that remain - * after the atomic operation. +/* remove tid_bit from the fd's running mask and returns the value before the + * atomic operation, so that the caller can know if it was present. */ static inline long fd_clr_running(int fd) { - return _HA_ATOMIC_AND_FETCH(&fdtab[fd].running_mask, ~tid_bit); + return _HA_ATOMIC_FETCH_AND(&fdtab[fd].running_mask, ~tid_bit); } /* Prepares for being polled */ diff --git a/src/fd.c b/src/fd.c index 79484efe6..52599c801 100644 --- a/src/fd.c +++ b/src/fd.c @@ -383,7 +383,7 @@ void fd_delete(int fd) HA_ATOMIC_OR(&fdtab[fd].running_mask, tid_bit); HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0); - if (fd_clr_running(fd) == 0) + if (fd_clr_running(fd) == ti->ltid_bit) _fd_delete_orphan(fd); } @@ -600,8 +600,7 @@ int fd_update_events(int fd, uint evts) * This is detected by both thread_mask and running_mask being 0 after * we remove ourselves last. 
*/ - if ((fdtab[fd].running_mask & tid_bit) && - fd_clr_running(fd) == 0 && !fdtab[fd].thread_mask) { + if (fd_clr_running(fd) == tid_bit && !fdtab[fd].thread_mask) { _fd_delete_orphan(fd); return FD_UPDT_CLOSED; } From aa4b8a5e2ae484b92f5587eefd927fca4ba065a2 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 7 Jul 2022 15:05:55 +0200 Subject: [PATCH 082/140] MEDIUM: fd: make fd_insert/fd_delete atomically update fd.tgid These functions need to set/reset the FD's tgid but when they're called there may still be wakeups on other threads that discover late updates and have to touch the tgid at the same time. As such, it is not possible to just read/write the tgid there. It must only be done using operations that are compatible with what other threads may be doing. As we're using inc/dec on the refcount, it's safe to AND the area to zero the lower part when resetting the value. However, in order to set the value, there's no other choice but fd_claim_tgid() which will assign it only if possible (via a CAS). This is convenient in the end because it protects the FD's masks from being modified by late threads, so while we hold this refcount we can safely reset the thread_mask and a few other elements. A debug test for non-null masks was added to fd_insert() as it must not be possible to face this situation thanks to the protection offered by the tgid. (cherry picked from commit ddedc1662487600d5124cb6d4972396438b9953c) [wt: this is necessary to fix a design bug introduced in 2.5; ctx adjustment for s/tgid/1 and the fact that we don't enforce the mask in fd_insert() in 2.6 and older. 
Note that this patch had a bug with newstate not being used for fd.state, and needs commit 7e94b40a] Signed-off-by: Willy Tarreau --- include/haproxy/fd.h | 22 +++++++++++++++------- src/fd.c | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index 39a10f217..c975218c7 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -423,6 +423,12 @@ static inline long fd_clr_running(int fd) static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), unsigned long thread_mask) { extern void sock_conn_iocb(int); + int newstate; + + /* conn_fd_handler should support edge-triggered FDs */ + newstate = 0; + if ((global.tune.options & GTUNE_FD_ET) && iocb == sock_conn_iocb) + newstate |= FD_ET_POSSIBLE; /* This must never happen and would definitely indicate a bug, in * addition to overwriting some unexpected memory areas. @@ -431,21 +437,23 @@ static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), unsigned BUG_ON(fd >= global.maxsock); BUG_ON(fdtab[fd].owner != NULL); BUG_ON(fdtab[fd].state != 0); - BUG_ON(fdtab[fd].refc_tgid != 0); + + BUG_ON(thread_mask == 0); + + fd_claim_tgid(fd, 1); + + BUG_ON(fdtab[fd].running_mask); fdtab[fd].owner = owner; fdtab[fd].iocb = iocb; fdtab[fd].state = 0; - fdtab[fd].refc_tgid = 1; + fdtab[fd].thread_mask = thread_mask; + fd_drop_tgid(fd); + #ifdef DEBUG_FD fdtab[fd].event_count = 0; #endif - /* conn_fd_handler should support edge-triggered FDs */ - if ((global.tune.options & GTUNE_FD_ET) && fdtab[fd].iocb == sock_conn_iocb) - fdtab[fd].state |= FD_ET_POSSIBLE; - - fdtab[fd].thread_mask = thread_mask; /* note: do not reset polled_mask here as it indicates which poller * still knows this FD from a possible previous round. 
*/ diff --git a/src/fd.c b/src/fd.c index 52599c801..5a47de570 100644 --- a/src/fd.c +++ b/src/fd.c @@ -334,7 +334,7 @@ void _fd_delete_orphan(int fd) polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; fdtab[fd].state = 0; - fdtab[fd].refc_tgid = 0; + fd_reset_tgid(fd); #ifdef DEBUG_FD fdtab[fd].event_count = 0; From f10cbcd7ed39ce632a25c0d90b0e926bd46c3652 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 19 Jul 2022 12:04:18 +0200 Subject: [PATCH 083/140] BUG/MINOR: fd: Properly init the fd state in fd_insert() When a new fd is inserted in the fdtab array, its state is initialized. The "newstate" variable is used to compute the right state (0 by default, but FD_ET_POSSIBLE flag is set if edge-triggered is supported for the fd). However, this variable is never used and the fd state is always set to 0. Now, the fd state is initialized with "newstate" variable. This bug was introduced by commit ddedc1662 ("MEDIUM: fd: make fd_insert/fd_delete atomically update fd.tgid"). No backport needed. 
(cherry picked from commit 7e94b40a22fab080b072c4757d487a40d2c6f828) [wt: this is necessary to fix a design bug introduced in 2.5] Signed-off-by: Willy Tarreau --- include/haproxy/fd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h index c975218c7..d815f1049 100644 --- a/include/haproxy/fd.h +++ b/include/haproxy/fd.h @@ -446,7 +446,7 @@ static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), unsigned fdtab[fd].owner = owner; fdtab[fd].iocb = iocb; - fdtab[fd].state = 0; + fdtab[fd].state = newstate; fdtab[fd].thread_mask = thread_mask; fd_drop_tgid(fd); From 7b80ec3eab35416a1de1dbface205cf49fccec16 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 8 Jul 2022 15:36:14 +0200 Subject: [PATCH 084/140] MEDIUM: fd: quit fd_update_events() when FD is closed The IOCB might have closed the FD itself, so it's not an error to have fd.tgid==0 or anything else, nor to have a null running_mask. In fact there are different conditions under which we can leave the IOCB, all of them have been enumerated in the code's comments (namely FD still valid and used, hence has running bit, FD closed but not yet reassigned thus running==0, FD closed and reassigned, hence different tgid and running becomes irrelevant, just like all other masks). For this reason we have no other solution but to try to grab the tgid on return before checking the other bits. In practice it doesn't represent a big cost, because if the FD was closed and reassigned, it's instantly detected and the bit is immediately released without blocking other threads, and if the FD wasn't closed this doesn't prevent it from being migrated to another thread. In the worst case a close by another thread after a migration will be postponed till the moment the running bit is cleared, which is the same as before. 
(cherry picked from commit 0b51eab76475b9f07b60815315d4c93f09e1b27f) [wt: this is necessary to fix a design bug introduced in 2.5; ctx adjustment for tid_bit instead of ti->ltid_bit] Signed-off-by: Willy Tarreau --- src/fd.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/src/fd.c b/src/fd.c index 5a47de570..3cb47355c 100644 --- a/src/fd.c +++ b/src/fd.c @@ -595,12 +595,52 @@ int fd_update_events(int fd, uint evts) fdtab[fd].iocb(fd); } + /* + * We entered iocb with running set and with the valid tgid. + * Since then, this is what could have happened: + * - another thread tried to close the FD (e.g. timeout task from + * another one that owns it). We still have running set, but not + * tmask. We must call fd_clr_running() then _fd_delete_orphan() + * if we were the last one. + * + * - the iocb tried to close the FD => bit no more present in running, + * nothing to do. If it managed to close it, the poller's ->clo() + * has already been called. + * + * - after we closed, the FD was reassigned to another thread in + * another group => running not present, tgid differs, nothing to + * do because if it got reassigned it indicates it was already + * closed. + * + * There's no risk of takeover of the valid FD here during this period. + * Also if we still have running, immediately after we release it, the + * events above might instantly happen due to another thread taking + * over. + * + * As such, the only cases where the FD is still relevant are: + * - tgid still set and running still set (most common) + * - tgid still valid but running cleared due to fd_delete(): we may + * still need to stop polling otherwise we may keep it enabled + * while waiting for other threads to close it. + * And given that we may need to program a tentative update in case we + * don't immediately close, it's easier to grab the tgid during the + * whole check. 
+ */ + + if (!fd_grab_tgid(fd, tgid)) + return FD_UPDT_CLOSED; + + tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); + /* another thread might have attempted to close this FD in the mean * time (e.g. timeout task) striking on a previous thread and closing. * This is detected by both thread_mask and running_mask being 0 after - * we remove ourselves last. + * we remove ourselves last. There is no risk the FD gets reassigned + * to a different group since it's not released until the real close() + * in _fd_delete_orphan(). */ - if (fd_clr_running(fd) == tid_bit && !fdtab[fd].thread_mask) { + if (fd_clr_running(fd) == tid_bit && !tmask) { + fd_drop_tgid(fd); _fd_delete_orphan(fd); return FD_UPDT_CLOSED; } @@ -615,6 +655,7 @@ int fd_update_events(int fd, uint evts) fd_updt[fd_nbupdt++] = fd; } + fd_drop_tgid(fd); return FD_UPDT_DONE; } From ee9539c9fb11221ffc1765a539af9a0a0d888cf7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 9 Jul 2022 23:55:43 +0200 Subject: [PATCH 085/140] MAJOR: poller: only touch/inspect the update_mask under tgid protection With thread groups and group-local masks, the update_mask cannot be touched nor even checked if it may change below us. In order to avoid this, we have to grab a reference to the FD's tgid before checking the update mask. The operations are cheap enough so that we don't notice it in performance tests. This is expected because the risk of meeting a reassigned FD during an update remains very low. It's worth noting that the tgid cannot be trusted during startup nor during soft-stop since that may come from anywhere at the moment. Since soft-stop runs under thread isolation we use that hint to decide whether or not to check that the FD's tgid matches the current one. The modification is applied to the 3 thread-aware pollers, i.e. epoll, kqueue, and evports. Also one poll_drop counter was missing for shared updates, though it might be hard to trigger it. With this change applied, thread groups are usable in benchmarks. 
(cherry picked from commit 1f947cb39ed92036131d71ea06404429208c4be3) [wt: this is necessary to fix a design bug introduced in 2.5; ctx adjustment for tid_bit instead of ti->ltid_bit] Signed-off-by: Willy Tarreau --- src/ev_epoll.c | 41 +++++++++++++++++++++++++++++++---------- src/ev_evports.c | 39 +++++++++++++++++++++++++++++---------- src/ev_kqueue.c | 37 ++++++++++++++++++++++++++++--------- 3 files changed, 88 insertions(+), 29 deletions(-) diff --git a/src/ev_epoll.c b/src/ev_epoll.c index 72d8dcad0..3fc868d16 100644 --- a/src/ev_epoll.c +++ b/src/ev_epoll.c @@ -156,16 +156,24 @@ static void _do_poll(struct poller *p, int exp, int wake) for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); - if (!fdtab[fd].owner) { + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ activity[tid].poll_drop_fd++; continue; } - _update_fd(fd); + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } fd_nbupdt = 0; - /* Scan the global update list */ + + /* Scan the shared update list */ for (old_fd = fd = update_list.first; fd != -1; fd = fdtab[fd].update.next) { if (fd == -2) { fd = old_fd; @@ -175,13 +183,26 @@ static void _do_poll(struct poller *p, int exp, int wake) fd = -fd -4; if (fd == -1) break; - if (fdtab[fd].update_mask & tid_bit) - done_update_polling(fd); + + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & tid_bit)) { + fd_drop_tgid(fd); + continue; + } + + done_update_polling(fd); + + if (fdtab[fd].owner) + _update_fd(fd); else - continue; - if (!fdtab[fd].owner) - continue; - _update_fd(fd); + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } thread_idle_now(); diff --git a/src/ev_evports.c b/src/ev_evports.c index 7ef26277c..1830b462d 100644 --- a/src/ev_evports.c +++ b/src/ev_evports.c @@ -124,16 
+124,24 @@ static void _do_poll(struct poller *p, int exp, int wake) for (i = 0; i < fd_nbupdt; i++) { fd = fd_updt[i]; - _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); - if (fdtab[fd].owner == NULL) { + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ activity[tid].poll_drop_fd++; continue; } - _update_fd(fd); + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); + + if (fdtab[fd].owner) + _update_fd(fd); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } fd_nbupdt = 0; - /* Scan the global update list */ + + /* Scan the shared update list */ for (old_fd = fd = update_list.first; fd != -1; fd = fdtab[fd].update.next) { if (fd == -2) { fd = old_fd; @@ -143,13 +151,24 @@ static void _do_poll(struct poller *p, int exp, int wake) fd = -fd -4; if (fd == -1) break; - if (fdtab[fd].update_mask & tid_bit) - done_update_polling(fd); + + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & tid_bit)) + continue; + + done_update_polling(fd); + + if (fdtab[fd].owner) + _update_fd(fd); else - continue; - if (!fdtab[fd].owner) - continue; - _update_fd(fd); + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } thread_idle_now(); diff --git a/src/ev_kqueue.c b/src/ev_kqueue.c index 991e39b8a..7ac9d6eef 100644 --- a/src/ev_kqueue.c +++ b/src/ev_kqueue.c @@ -100,12 +100,20 @@ static void _do_poll(struct poller *p, int exp, int wake) for (updt_idx = 0; updt_idx < fd_nbupdt; updt_idx++) { fd = fd_updt[updt_idx]; - _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); - if (!fdtab[fd].owner) { + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ activity[tid].poll_drop_fd++; continue; } - changes = _update_fd(fd, changes); + + _HA_ATOMIC_AND(&fdtab[fd].update_mask, ~tid_bit); + + if (fdtab[fd].owner) + changes = _update_fd(fd, changes); + else + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } /* Scan the global update list */ for (old_fd = fd = update_list.first; fd != -1; fd = 
fdtab[fd].update.next) { @@ -117,13 +125,24 @@ static void _do_poll(struct poller *p, int exp, int wake) fd = -fd -4; if (fd == -1) break; - if (fdtab[fd].update_mask & tid_bit) - done_update_polling(fd); + + if (!fd_grab_tgid(fd, 1)) { + /* was reassigned */ + activity[tid].poll_drop_fd++; + continue; + } + + if (!(fdtab[fd].update_mask & tid_bit)) + continue; + + done_update_polling(fd); + + if (fdtab[fd].owner) + changes = _update_fd(fd, changes); else - continue; - if (!fdtab[fd].owner) - continue; - changes = _update_fd(fd, changes); + activity[tid].poll_drop_fd++; + + fd_drop_tgid(fd); } thread_idle_now(); From 01a4ebcdd06d958ab45e1a0043078777949d84ca Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 15 Jul 2022 20:12:31 +0200 Subject: [PATCH 086/140] MEDIUM: fd: support broadcasting updates for foreign groups in updt_fd_polling We're still facing the situation where it's impossible to update an FD for a foreign group. That's of particular concern when disabling/enabling listeners (e.g. pause/resume on signals) since we don't decide which thread gets the signal and it needs to process all listeners at once. Fortunately, not that much is unprotected in FDs. This patch adds a test for tgid's equality in updt_fd_polling() so that if a change is applied for a foreing group, then it's detected and taken care of separately. The method consists in forcing the update on all bound threads in this group, adding it to the group's update_list, and sending a wake-up as would be done for a remote thread in the local group, except that this is done by grabbing a reference to the FD's tgid. Thanks to this, SIGTTOU/SIGTTIN now work for nbtgroups > 1 (after that was temporarily broken by "MEDIUM: fd/poller: make the update-list per-group"). 
(cherry picked from commit cfdd20a0b2d3c49db91a37f8b7b65e5ca5dd2e2b) [wt: this is necessary to fix a design bug introduced in 2.5; only the take/check/drop part was kept since we support only one group before 2.7] Signed-off-by: Willy Tarreau --- src/fd.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/fd.c b/src/fd.c index 3cb47355c..7009ffeb5 100644 --- a/src/fd.c +++ b/src/fd.c @@ -463,6 +463,14 @@ int fd_takeover(int fd, void *expected_owner) void updt_fd_polling(const int fd) { + uint tgrp = fd_take_tgid(fd); + + /* closed ? may happen */ + if (!tgrp) + return; + + fd_drop_tgid(fd); + if (all_threads_mask == 1UL || (fdtab[fd].thread_mask & all_threads_mask) == tid_bit) { if (HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) return; From 410612e9d878be7cb624623713cf89499089390b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 4 Mar 2023 15:33:24 +0100 Subject: [PATCH 087/140] BUG/MAJOR: fd/thread: fix race between updates and closing FD While running some L7 retries tests, Christopher and I stumbled upon a very strange behavior showing some occasional server timeouts when the server closes keep-alive connections quickly. 
The issue can be reproduced with the following config: global expose-experimental-directives #tune.fd.edge-triggered on # can speed up the issue defaults mode http timeout client 5s timeout server 10s timeout connect 2s listen f bind :8001 http-reuse always retry-on all-retryable-errors server next 127.0.0.1:8002 frontend b bind :8002 timeout http-keep-alive 1 # one ms redirect location / Sending fast requests without reusing the client connection on port 8001 with a single connection and at least 3 threads on haproxy occasionally shows some glitches pauses (below with timeout server 2s): $ taskset -c 2,3 h1load -e -t 1 -r 1 -c 1 http://127.0.0.1:8001/ # time conns tot_conn tot_req tot_bytes err cps rps bps ttfb 1 1 9794 9793 959714 0 9k79 9k79 7M67 42.94u 2 1 9794 9793 959714 0 0.00 0.00 0.00 - 3 1 9794 9793 959714 0 0.00 0.00 0.00 - 4 0 16015 16015 1569470 0 6k22 6k22 4M87 522.9u 5 0 18657 18656 1828190 2 2k63 2k63 2M06 39.22u If this doesn't happen, limiting to a request rate close to 1/timeout may help. What is happening is that after several migrations, a late report via fd_update_events() may detect that the thread is not welcome, and will want to program an update so that the current thread's poller disables its polling on it. It is allowed to do so because it used fd_grab_tgid(). But what if _fd_delete_orphan() was just starting to be called and already reset the update_mask ? We'll end up with a bit present in the update mask, then _fd_delete_orphan() resets the tgid, which will prevent the poller from consuming that update. The update is not needed anymore since the FD was closed, but in this case nobody will clear this bit until the same FD is reused again and cleared. And as long as the thread's bit remains in the update_mask, no new updates will be programmed for the next use of this FD on the same thread since due to the bit being present, fd_nbupdt will not be changed. This is what is causing this timeout. 
The fix consists in making sure _fd_delete_orphan() waits for the occasional watchers to leave, and to do this before clearing the update_mask. This will be either fd_update_events() trying to check its thread_mask, or the poller checking its updates, so that's pretty short. But it definitely closes this race. This fix is needed since the introduction of fd_grab_tgid(), hence 2.7. Note that while testing the fix, another related issue concerning the atomicity of running_mask vs thread_mask popped up and will have to be fixed till 2.5 as part of another patch. It may make the tests for this fix occasionally tigger a few BUG_ON() or face a null conn->subs in sock_conn_iocb(), though these ones are much more difficult to trigger. This is not caused by this fix. (cherry picked from commit 237e6a0d655df300b59b98b8a809f95bacb7206f) Signed-off-by: Willy Tarreau --- src/fd.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/fd.c b/src/fd.c index 7009ffeb5..b15ad6274 100644 --- a/src/fd.c +++ b/src/fd.c @@ -322,6 +322,19 @@ void _fd_delete_orphan(int fd) if (cur_poller.clo) cur_poller.clo(fd); + /* now we're about to reset some of this FD's fields. We don't want + * anyone to grab it anymore and we need to make sure those which could + * possibly have stumbled upon it right now are leaving before we + * proceed. This is done in two steps. First we reset the tgid so that + * fd_take_tgid() and fd_grab_tgid() fail, then we wait for existing + * ref counts to drop. Past this point we're alone dealing with the + * FD's thead/running/update/polled masks. + */ + fd_reset_tgid(fd); + + while (_HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) != 0) // refc==0 ? 
+ __ha_cpu_relax(); + /* we don't want this FD anymore in the global list */ fd_rm_from_fd_list(&update_list, fd, offsetof(struct fdtab, update)); @@ -334,7 +347,6 @@ void _fd_delete_orphan(int fd) polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0; fdtab[fd].state = 0; - fd_reset_tgid(fd); #ifdef DEBUG_FD fdtab[fd].event_count = 0; From 73cf297c28e97a8e7f658c7513e3d7d707a85bdd Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 7 Mar 2023 10:11:02 -0800 Subject: [PATCH 088/140] BUG/MAJOR: fd/threads: close a race on closing connections after takeover As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between updates and closing FD"), a race was found during stress tests involving heavy backend connection reuse with many competing closes. Here the problem is complex. The analysis in commit f69fea64e ("MAJOR: fd: get rid of the DWCAS when setting the running_mask") that removed the DWCAS in 2.5 overlooked a few races. First, a takeover from thread1 could happen just after fd_update_events() in thread2 validates it holds the tmask bit in the CAS loop. Since thread1 releases running_mask after the operation, thread2 will succeed the CAS and both will believe the FD is theirs. This does explain the occasional crashes seen with h1_io_cb() being called on a bad context, or sock_conn_iocb() seeing conn->subs vanish after checking it. This issue can be addressed using a DWCAS in both fd_takeover() and fd_update_events() as it was before the patch above but this is not portable to all archs and is not easy to adapt for those lacking it, due to some operations still happening only on individual masks after the thread groups were added. Second, the checks after fd_clr_running() for the current thread being the last one is not sufficient: at the exact moment the operation completes, another thread may also set and drop the running bit and see itself as alone, and both can call _fd_close_orphan() in parallel. 
In order to prevent this from happening, we cannot rely on the absence
of others, we need an explicit flag indicating that the FD must be
closed. One approach that was attempted consisted in playing with the
thread_mask but that was not reliable since it could still match between
the late deletion and the early insertion that follows. Instead, a new
FD flag was added, FD_MUST_CLOSE, that exactly indicates that the call
to _fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().

With both points addressed, there's no more visible race left:

  - takeover() only happens under the connection list's lock and cannot
    compete with fd_delete() since fd_delete() must first remove the
    connection from the list before deleting the FD. That's also why it
    doesn't need to call _fd_delete_orphan() when dropping its running
    bit.

  - takeover() sets its running bit then atomically replaces the thread
    mask, so that until that's done, it doesn't validate the condition
    to end the synchronization loop in fd_update_events(). Once it's OK,
    the previous thread's bit is lost, and this is checked for in
    fd_update_events()

  - fd_update_events() can compete with fd_delete() at various places
    which are explained above. Since fd_delete() clears the thread mask
    after setting its running bit and after setting the FD_MUST_CLOSE
    bit, the synchronization loop guarantees that the thread mask is
    seen before going further, and that once it's seen, the
    FD_MUST_CLOSE flag is already present.

  - fd_delete() may start while fd_update_events() has already started,
    but fd_delete() must hold a bit in thread_mask before starting, and
    that is checked by the first test in fd_update_events() before
    setting the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan() nor fd_insert() thanks to the fd_grab_tgid() that's always done before updating the polled_mask, and guarantees that we never pretend that a polled_mask has a bit before the FD is added. The issue is very hard to reproduce and is extremely time-sensitive. Some tests were required with a 1-ms timeout with request rates closely matching 1 kHz per server, though certain tests sometimes benefitted from saturation. It was found that adding the following slowdown at a few key places helped a lot and managed to trigger the bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread setup: { volatile int i = 10000; while (i--); } Particularly, placing it at key places where only one of running_mask or thread_mask is set and not the other one yet (e.g. after the synchronization loop in fd_update_events or after dropping the running bit) did yield great results. Many thanks to Olivier Houchard for this expert help analysing these races and reviewing candidate fixes. The patch must be backported to 2.5. Note that 2.6 does not have tgid in FDs, and that it requires a change of output on fd_clr_running() as we need the previous bit. This is provided by carefully backporting commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous value instead"). Tests have shown that the lack of tgid is a showstopper for 2.6 and that unless a better workaround is found, it could still be preferable to backport the minimum pieces required for fd_grab_tgid() to 2.6 so that it stays stable long. 
(cherry picked from commit cd8914bc525668bc4a41278deee3430c69d29500) Signed-off-by: Willy Tarreau (cherry picked from commit 877a13db06dbbbd2b85e3f50489af0560d29eb61) Signed-off-by: Willy Tarreau --- include/haproxy/fd-t.h | 2 + src/fd.c | 110 +++++++++++++++++++++++++++++++---------- 2 files changed, 85 insertions(+), 27 deletions(-) diff --git a/include/haproxy/fd-t.h b/include/haproxy/fd-t.h index 32c173ae0..d36dabc85 100644 --- a/include/haproxy/fd-t.h +++ b/include/haproxy/fd-t.h @@ -70,6 +70,7 @@ enum { #define FD_EXPORTED_BIT 20 /* FD is exported and must not be closed */ #define FD_EXCL_SYSCALL_BIT 21 /* a syscall claims exclusivity on this FD */ #define FD_DISOWN_BIT 22 /* this fd will be closed by some external code */ +#define FD_MUST_CLOSE_BIT 23 /* this fd will be closed by some external code */ /* and flag values */ @@ -111,6 +112,7 @@ enum { #define FD_EXPORTED (1U << FD_EXPORTED_BIT) #define FD_EXCL_SYSCALL (1U << FD_EXCL_SYSCALL_BIT) #define FD_DISOWN (1U << FD_DISOWN_BIT) +#define FD_MUST_CLOSE (1U << FD_MUST_CLOSE_BIT) /* FD update status after fd_update_events() */ enum { diff --git a/src/fd.c b/src/fd.c index b15ad6274..0dbaef8da 100644 --- a/src/fd.c +++ b/src/fd.c @@ -391,12 +391,21 @@ void fd_delete(int fd) * will not take new bits in its running_mask so we have the guarantee * that the last thread eliminating running_mask is the one allowed to * safely delete the FD. Most of the time it will be the current thread. + * We still need to set and check the one-shot flag FD_MUST_CLOSE + * to take care of the rare cases where a thread wakes up on late I/O + * before the thread_mask is zero, and sets its bit in the running_mask + * just after the current thread finishes clearing its own bit, hence + * the two threads see themselves as last ones (which they really are). 
*/ HA_ATOMIC_OR(&fdtab[fd].running_mask, tid_bit); + HA_ATOMIC_OR(&fdtab[fd].state, FD_MUST_CLOSE); HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0); - if (fd_clr_running(fd) == ti->ltid_bit) - _fd_delete_orphan(fd); + if (fd_clr_running(fd) == tid_bit) { + if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) { + _fd_delete_orphan(fd); + } + } } /* makes the new fd non-blocking and clears all other O_* flags; this is meant @@ -535,29 +544,47 @@ int fd_update_events(int fd, uint evts) return FD_UPDT_CLOSED; } - /* do nothing if the FD was taken over under us */ + /* Do not take running_mask if not strictly needed (will trigger a + * cosmetic BUG_ON() in fd_insert() anyway if done). + */ + tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); + if (!(tmask & tid_bit)) + goto do_update; + + HA_ATOMIC_OR(&fdtab[fd].running_mask, tid_bit); + + /* From this point, our bit may possibly be in thread_mask, but it may + * still vanish, either because a takeover completed just before taking + * the bit above with the new owner deleting the FD, or because a + * takeover started just before taking the bit. In order to make sure a + * started takeover is complete, we need to verify that all bits of + * running_mask are present in thread_mask, since takeover first takes + * running then atomically replaces thread_mask. Once it's stable, if + * our bit remains there, no further takeover may happen because we + * hold running, but if our bit is not there it means we've lost the + * takeover race and have to decline touching the FD. Regarding the + * risk of deletion, our bit in running_mask prevents fd_delete() from + * finalizing the close, and the caller will leave the FD with a zero + * thread_mask and the FD_MUST_CLOSE flag set. It will then be our + * responsibility to close it. + */ do { - /* make sure we read a synchronous copy of rmask and tmask - * (tmask is only up to date if it reflects all of rmask's - * bits). 
- */ - do { - rmask = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask); - tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); - } while (rmask & ~tmask); + rmask = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask); + tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask); + rmask &= ~tid_bit; + } while (rmask & ~tmask); - if (!(tmask & tid_bit)) { - /* a takeover has started */ - activity[tid].poll_skip_fd++; + /* Now tmask is stable. Do nothing if the FD was taken over under us */ - /* Let the poller know this FD was lost */ - if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) - fd_updt[fd_nbupdt++] = fd; + if (!(tmask & tid_bit)) { + /* a takeover has started */ + activity[tid].poll_skip_fd++; - fd_drop_tgid(fd); - return FD_UPDT_MIGRATED; - } - } while (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &rmask, rmask | tid_bit)); + if (fd_clr_running(fd) == tid_bit) + goto closed_or_migrated; + + goto do_update; + } /* with running we're safe now, we can drop the reference */ fd_drop_tgid(fd); @@ -654,16 +681,16 @@ int fd_update_events(int fd, uint evts) /* another thread might have attempted to close this FD in the mean * time (e.g. timeout task) striking on a previous thread and closing. - * This is detected by both thread_mask and running_mask being 0 after + * This is detected by us being the last owners of a running_mask bit, + * and the thread_mask being zero. At the moment we release the running + * bit, a takeover may also happen, so in practice we check for our loss + * of the thread_mask bitboth thread_mask and running_mask being 0 after * we remove ourselves last. There is no risk the FD gets reassigned * to a different group since it's not released until the real close() * in _fd_delete_orphan(). 
*/ - if (fd_clr_running(fd) == tid_bit && !tmask) { - fd_drop_tgid(fd); - _fd_delete_orphan(fd); - return FD_UPDT_CLOSED; - } + if (fd_clr_running(fd) == tid_bit && !(tmask & tid_bit)) + goto closed_or_migrated; /* we had to stop this FD and it still must be stopped after the I/O * cb's changes, so let's program an update for this. @@ -677,6 +704,35 @@ int fd_update_events(int fd, uint evts) fd_drop_tgid(fd); return FD_UPDT_DONE; + + closed_or_migrated: + /* We only come here once we've last dropped running and the FD is + * not for us as per !(tmask & tid_bit). It may imply we're + * responsible for closing it. Otherwise it's just a migration. + */ + if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) { + fd_drop_tgid(fd); + _fd_delete_orphan(fd); + return FD_UPDT_CLOSED; + } + + /* So we were alone, no close bit, at best the FD was migrated, at + * worst it's in the process of being closed by another thread. We must + * be ultra-careful as it can be re-inserted by yet another thread as + * the result of socket() or accept(). Let's just tell the poller the + * FD was lost. If it was closed it was already removed and this will + * only cost an update for nothing. + */ + + do_update: + /* The FD is not closed but we don't want the poller to wake up for + * it anymore. + */ + if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, tid)) + fd_updt[fd_nbupdt++] = fd; + + fd_drop_tgid(fd); + return FD_UPDT_MIGRATED; } /* Tries to send parts from followed by parts from From da5699aca118150d5d3455396aab95057e69a216 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Fri, 17 Feb 2023 09:51:20 +0100 Subject: [PATCH 089/140] MINOR: h3/hq-interop: handle no data in decode_qcs() with FIN set Properly handle a STREAM frame with no data but the FIN bit set at the application layer. H3 and hq-interop decode_qcs() callback have been adjusted to not return early in this case. If the FIN bit is accepted, a HTX EOM must be inserted for the upper stream layer. 
If the FIN is rejected because the stream cannot be closed, a proper CONNECTION_CLOSE error will be triggered. A new utility function qcs_http_handle_standalone_fin() has been implemented in the qmux_http module. This allows to simply add the HTX EOM on qcs HTX buffer. If the HTX buffer is empty, a EOT is first added to ensure it will be transmitted above. This commit will allow to properly handle FIN notify through an empty STREAM frame. However, it is not sufficient as currently qcc_recv() skip the decode_qcs() invocation when the offset is already received. This will be fixed in the next commit. This should be backported up to 2.6 along with the next patch. (cherry picked from commit 381d8137e31d941c9143a1dc8b5760d29f388fef) Signed-off-by: Christopher Faulet (cherry picked from commit c07a1c32d98812326dfff05cca02f8af44123b7e) [ad: adjusted context : no h3 stream error level on 2.6] Signed-off-by: Amaury Denoyelle --- include/haproxy/qmux_http.h | 2 ++ src/h3.c | 8 ++++++-- src/hq_interop.c | 8 ++++++++ src/mux_quic.c | 4 ++-- src/qmux_http.c | 22 ++++++++++++++++++++++ 5 files changed, 40 insertions(+), 4 deletions(-) diff --git a/include/haproxy/qmux_http.h b/include/haproxy/qmux_http.h index a7dbe7cc3..98151db16 100644 --- a/include/haproxy/qmux_http.h +++ b/include/haproxy/qmux_http.h @@ -12,6 +12,8 @@ size_t qcs_http_snd_buf(struct qcs *qcs, struct buffer *buf, size_t count, char *fin); size_t qcs_http_reset_buf(struct qcs *qcs, struct buffer *buf, size_t count); +void qcs_http_handle_standalone_fin(struct qcs *qcs); + #endif /* USE_QUIC */ #endif /* _HAPROXY_MUX_QUIC_HTTP_H */ diff --git a/src/h3.c b/src/h3.c index 3b3db2497..66fc8d548 100644 --- a/src/h3.c +++ b/src/h3.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -824,8 +825,6 @@ static ssize_t h3_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) ssize_t total = 0, ret; h3_debug_printf(stderr, "%s: STREAM ID: %lu\n", __func__, qcs->id); - if (!b_data(b)) - 
return 0; if (quic_stream_is_uni(qcs->id) && !(h3s->flags & H3_SF_UNI_INIT)) { if ((ret = h3_init_uni_stream(h3c, qcs, b)) < 0) @@ -855,6 +854,11 @@ static ssize_t h3_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) return -1; } + if (!b_data(b) && fin && quic_stream_is_bidi(qcs->id)) { + qcs_http_handle_standalone_fin(qcs); + return 0; + } + while (b_data(b) && !(qcs->flags & QC_SF_DEM_FULL)) { uint64_t ftype, flen; char last_stream_frame = 0; diff --git a/src/hq_interop.c b/src/hq_interop.c index 37bb2e219..175b92dec 100644 --- a/src/hq_interop.c +++ b/src/hq_interop.c @@ -7,6 +7,7 @@ #include #include #include +#include static ssize_t hq_interop_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) { @@ -19,6 +20,13 @@ static ssize_t hq_interop_decode_qcs(struct qcs *qcs, struct buffer *b, int fin) size_t size = b_size(b); size_t data = b_data(b); + if (!data && fin) { + /* FIN is notified with an empty STREAM frame. */ + BUG_ON(!qcs->sd); /* sd must already be attached here */ + qcs_http_handle_standalone_fin(qcs); + return 0; + } + b_alloc(&htx_buf); htx = htx_from_buf(&htx_buf); diff --git a/src/mux_quic.c b/src/mux_quic.c index 417e8d1db..dc438acd8 100644 --- a/src/mux_quic.c +++ b/src/mux_quic.c @@ -756,10 +756,10 @@ static int qcc_decode_qcs(struct qcc *qcc, struct qcs *qcs) goto err; } - if (ret) { + if (ret) qcs_consume(qcs, ret); + if (ret || (!b_data(&b) && fin)) qcs_notify_recv(qcs); - } TRACE_LEAVE(QMUX_EV_QCS_RECV, qcc->conn, qcs); return 0; diff --git a/src/qmux_http.c b/src/qmux_http.c index 3ce4a3438..6eedf0c4a 100644 --- a/src/qmux_http.c +++ b/src/qmux_http.c @@ -107,3 +107,25 @@ size_t qcs_http_reset_buf(struct qcs *qcs, struct buffer *buf, size_t count) return count; } + +/* Utility function which can be used by app layer an empty STREAM frame is + * received with FIN bit set for stream. It will ensure that HTX EOM is + * properly inserted in app_buf. 
+ */ +void qcs_http_handle_standalone_fin(struct qcs *qcs) +{ + struct buffer *appbuf; + struct htx *htx = NULL; + + appbuf = qc_get_buf(qcs, &qcs->rx.app_buf); + BUG_ON(!appbuf); + + htx = htx_from_buf(appbuf); + if (htx_is_empty(htx)) { + if (!htx_add_endof(htx, HTX_BLK_EOT)) { + ABORT_NOW(); /* cannot happen for empty HTX message. */ + } + } + htx->flags |= HTX_FL_EOM; + htx_to_buf(htx, appbuf); +} From 2974e70276400129261cc23b461c3da655566854 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Tue, 14 Feb 2023 15:36:36 +0100 Subject: [PATCH 090/140] BUG/MINOR: mux-quic: transfer FIN on empty STREAM frame Implement support for clients that emit the stream FIN with an empty STREAM frame. For that, qcc_recv() offset comparison has been adjusted. If offset has already been received but the FIN bit is now transmitted, do not skip the rest of the function and call application layer decode_qcs() callback. Without this, streams will be kept open forever as HTX EOM is never transfered to the upper stream layer. This behavior was observed with mvfst client prior to its patch 38c955a024aba753be8bf50fdeb45fba3ac23cfd Fix hq-interop (HTTP 0.9 over QUIC) This notably caused the interop multiplexing test to fail as unclosed streams on haproxy side prevented the emission of new MAX_STREAMS frame to the client. This shoud be backported up to 2.6. 
It also relies on previous commit : 381d8137e31d941c9143a1dc8b5760d29f388fef MINOR: h3/hq-interop: handle no data in decode_qcs() with FIN set (cherry picked from commit fa241939c7ce77014eae2913252e5de21bdaa4d0) Signed-off-by: Christopher Faulet (cherry picked from commit f3c43d180369ddcaf58cb7abb3db4494f35a2f29) [ad: context adjusted due to RESET_STREAM unsupported on 2.6] Signed-off-by: Amaury Denoyelle --- src/mux_quic.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/mux_quic.c b/src/mux_quic.c index dc438acd8..ee035bbd6 100644 --- a/src/mux_quic.c +++ b/src/mux_quic.c @@ -883,11 +883,8 @@ int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset, goto err; } - if (offset + len <= qcs->rx.offset) { - /* TODO offset may have been received without FIN first and now - * with it. In this case, it must be notified to be able to - * close the stream. - */ + if (offset + len < qcs->rx.offset || + (offset + len == qcs->rx.offset && (!fin || (qcs->flags & QC_SF_SIZE_KNOWN)))) { TRACE_DATA("already received offset", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); goto out; } @@ -930,9 +927,13 @@ int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset, offset = qcs->rx.offset; } - ret = ncb_add(&qcs->rx.ncbuf, offset - qcs->rx.offset, data, len, NCB_ADD_COMPARE); - if (ret != NCB_RET_OK) { - if (ret == NCB_RET_DATA_REJ) { + if (len) { + ret = ncb_add(&qcs->rx.ncbuf, offset - qcs->rx.offset, data, len, NCB_ADD_COMPARE); + switch (ret) { + case NCB_RET_OK: + break; + + case NCB_RET_DATA_REJ: /* RFC 9000 2.2. 
Sending and Receiving Data * * An endpoint could receive data for a stream at the @@ -946,12 +947,13 @@ int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset, TRACE_ERROR("overlapping data rejected", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV|QMUX_EV_PROTO_ERR, qcc->conn, qcs); qcc_emit_cc(qcc, QC_ERR_PROTOCOL_VIOLATION); - } - else if (ret == NCB_RET_GAP_SIZE) { + return 1; + + case NCB_RET_GAP_SIZE: TRACE_DATA("cannot bufferize frame due to gap size limit", QMUX_EV_QCC_RECV|QMUX_EV_QCS_RECV, qcc->conn, qcs); + return 1; } - return 1; } if (fin) @@ -962,7 +964,7 @@ int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset, qcs_close_remote(qcs); } - if (ncb_data(&qcs->rx.ncbuf, 0) && !(qcs->flags & QC_SF_DEM_FULL)) { + if ((ncb_data(&qcs->rx.ncbuf, 0) && !(qcs->flags & QC_SF_DEM_FULL)) || fin) { qcc_decode_qcs(qcc, qcs); qcc_refresh_timeout(qcc); } From e8e45d2f6e8206a3fcd0189953dc46fe67fe1eb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 9 Feb 2023 20:37:26 +0100 Subject: [PATCH 091/140] BUG/MINOR: quic: Possible unexpected counter incrementation on send*() errors Some counters could potentially be incremented even if send*() syscall returned no error when ret >= 0 and ret != sz. This could be the case for instance if a first call to send*() returned -1 with errno set to EINTR (or any previous syscall which set errno to a non-null value) and if the next call to send*() returned something positive and smaller than . Must be backported to 2.7 and 2.6. 
(cherry picked from commit 9fc10aff05462fe88bc117cda20d381dfb2ea9f7) Signed-off-by: Christopher Faulet (cherry picked from commit 0b7a478eadad2baf4602d67d7eeae396f70d6f6f) Signed-off-by: Amaury Denoyelle --- src/quic_sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/quic_sock.c b/src/quic_sock.c index a3d343e43..f096ca3e6 100644 --- a/src/quic_sock.c +++ b/src/quic_sock.c @@ -458,7 +458,7 @@ int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz, (struct sockaddr *)&qc->peer_addr, get_addr_len(&qc->peer_addr)); } while (ret < 0 && errno == EINTR); - if (ret < 0 || ret != sz) { + if (ret < 0) { struct proxy *prx = qc->li->bind_conf->frontend; struct quic_counters *prx_counters = EXTRA_COUNTERS_GET(prx->extra_counters_fe, @@ -480,6 +480,9 @@ int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t sz, return 1; } + if (ret != sz) + return 1; + /* we count the total bytes sent, and the send rate for 32-byte blocks. * The reason for the latter is that freq_ctr are limited to 4GB and * that it's not enough per second. From 3a2fe6142d0f7ca24c4a5f47814b33afd2b9d3c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Wed, 8 Feb 2023 17:43:13 +0100 Subject: [PATCH 092/140] BUG/MINOR: quic: Really cancel the connection timer from qc_set_timer() The ->expire field of the timer task to be cancelled was not reset to TICK_ETERNITY. Must be backported to 2.6 and 2.7. 
(cherry picked from commit dea329828271fee1e92f43043abb90ea3fef590a) Signed-off-by: Christopher Faulet (cherry picked from commit f30c5c977781f221a6f84feeee20fe0f8a84becd) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 26bd30760..c541df79d 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -768,8 +768,11 @@ static inline void qc_set_timer(struct quic_conn *qc) if (tick_isset(pto)) qc->timer = pto; out: - if (qc->timer_task && qc->timer != TICK_ETERNITY) { - if (tick_is_expired(qc->timer, now_ms)) { + if (qc->timer_task) { + if (qc->timer == TICK_ETERNITY) { + qc->timer_task->expire = TICK_ETERNITY; + } + else if (tick_is_expired(qc->timer, now_ms)) { TRACE_DEVEL("wakeup asap timer task", QUIC_EV_CONN_STIMER, qc); task_wakeup(qc->timer_task, TASK_WOKEN_MSG); } From 295b72e22e4d34939e97ba5e6ed6ff898678106a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Fri, 10 Feb 2023 16:35:43 +0100 Subject: [PATCH 093/140] BUG/MINOR: quic: Missing call to task_queue() in qc_idle_timer_do_rearm() The aim of this function is to rearm the idle timer. The ->expire field of the timer task was updated without being requeued. Some connection could be unexpectedly terminated. Must be backported to 2.6 and 2.7. 
(cherry picked from commit 1e8ef1bed6ef687e66578214f0b184aab37b4154) Signed-off-by: Christopher Faulet (cherry picked from commit 6176a60afb8cbb95622bf24b27f34a7b121c512b) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/quic_conn.c b/src/quic_conn.c index c541df79d..8c4bfb4e8 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -5151,6 +5151,7 @@ static void qc_idle_timer_do_rearm(struct quic_conn *qc) expire = QUIC_MAX(3 * quic_pto(qc), qc->max_idle_timeout); qc->idle_timer_task->expire = tick_add(now_ms, MS_TO_TICKS(expire)); + task_queue(qc->idle_timer_task); } /* Rearm the idle timer for QUIC connection depending on boolean From 7913782648335535dcee1a5226227f6c6d38e3af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Tue, 14 Feb 2023 16:00:18 +0100 Subject: [PATCH 094/140] BUG/MINOR: quic: Do not probe with too little Initial packets Before probing the Initial packet number space, verify that we can at least sent 1200 bytes by datagram. This may not be the case due to the amplification limit. Must be backported to 2.6 and 2.7. (cherry picked from commit 7c6d8f88df0cf08b33776ce8e4279f8813d94085) Signed-off-by: Christopher Faulet (cherry picked from commit e8b25661924dbe9746403189a0ab4aa81d4bcc40) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 8c4bfb4e8..c6941755e 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -4719,6 +4719,16 @@ static int quic_conn_enc_level_init(struct quic_conn *qc, return ret; } +/* Return 1 if connection may probe the Initial packet number space, 0 if not. + * This is not the case if the remote peer address is not validated and if + * it cannot send at least QUIC_INITIAL_PACKET_MINLEN bytes. 
+ */ +static int qc_may_probe_ipktns(struct quic_conn *qc) +{ + return quic_peer_validated_addr(qc) || + (int)(3 * qc->rx.bytes - qc->tx.prep_bytes) >= QUIC_INITIAL_PACKET_MINLEN; +} + /* Callback called upon loss detection and PTO timer expirations. */ struct task *qc_process_timer(struct task *task, void *ctx, unsigned int state) { @@ -4750,24 +4760,39 @@ struct task *qc_process_timer(struct task *task, void *ctx, unsigned int state) qc->subs = NULL; } else { - qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; - pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; if (pktns == &qc->pktns[QUIC_TLS_PKTNS_INITIAL]) { - TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + if (qc_may_probe_ipktns(qc)) { + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + else { + TRACE_STATE("Cannot probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } if (qc->pktns[QUIC_TLS_PKTNS_HANDSHAKE].tx.in_flight) { + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; qc->pktns[QUIC_TLS_PKTNS_HANDSHAKE].flags |= QUIC_FL_PKTNS_PROBE_NEEDED; TRACE_STATE("needs to probe Handshake packet number space", QUIC_EV_CONN_TXPKT, qc); } } else if (pktns == &qc->pktns[QUIC_TLS_PKTNS_HANDSHAKE]) { TRACE_STATE("needs to probe Handshake packet number space", QUIC_EV_CONN_TXPKT, qc); + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; if (qc->pktns[QUIC_TLS_PKTNS_INITIAL].tx.in_flight) { - qc->pktns[QUIC_TLS_PKTNS_INITIAL].flags |= QUIC_FL_PKTNS_PROBE_NEEDED; - TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + if (qc_may_probe_ipktns(qc)) { + qc->pktns[QUIC_TLS_PKTNS_INITIAL].flags |= QUIC_FL_PKTNS_PROBE_NEEDED; + TRACE_STATE("needs to probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } + else { + TRACE_STATE("Cannot probe Initial packet number space", QUIC_EV_CONN_TXPKT, qc); + } } 
} else if (pktns == &qc->pktns[QUIC_TLS_PKTNS_01RTT]) { TRACE_STATE("needs to probe 01RTT packet number space", QUIC_EV_CONN_TXPKT, qc); + qc->flags |= QUIC_FL_CONN_RETRANS_NEEDED; + pktns->flags |= QUIC_FL_PKTNS_PROBE_NEEDED; } } } From 29165bffccf5587d5b3eb079cf2892a9b676a9f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Wed, 15 Feb 2023 11:55:21 +0100 Subject: [PATCH 095/140] BUG/MINOR: quic: Wrong initialization for io_cb_wakeup boolean This bug arrives with this commit: 982896961 MINOR: quic: split and rename qc_lstnr_pkt_rcv() The first block of code consists in possibly setting this variable to true. But it was already initialized to true before entering this code section. Should be initialized to false. Also take the opportunity to remove an unused "err" label. Must be backported to 2.6 and 2.7. (cherry picked from commit 8f7d22406cf35745708171dda8fa712d78a029ca) Signed-off-by: Christopher Faulet (cherry picked from commit d79a8f0f8136a5f384d0cf9a274eac21cf2c1343) [ad: adjusted context] Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index c6941755e..7e10287c3 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -6407,7 +6407,7 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, const struct quic_version *qv = pkt->version; struct quic_enc_level *qel = NULL; size_t b_cspace; - int io_cb_wakeup = 1; + int io_cb_wakeup = 0; if (pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST && !quic_peer_validated_addr(qc) && @@ -6472,7 +6472,6 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, drop: HA_ATOMIC_INC(&qc->prx_counters->dropped_pkt); - err: /* Wakeup the I/O handler callback if the PTO timer must be armed. * This cannot be done by this thread. 
*/ From 2b3722942cf3e3538693e9c040cdfdacb3f1a81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 16 Feb 2023 11:40:11 +0100 Subject: [PATCH 096/140] BUG/MINOR: quic: Do not drop too small datagrams with Initial packets When receiving an Initial packet a peer must drop it if the datagram is smaller than 1200. Before this patch, this is the entire datagram which was dropped. In such a case, drop the packet after having parsed its length. Must be backported to 2.6 and 2.7 (cherry picked from commit 35218c6357b441142b2af19e31c8991a28b97075) Signed-off-by: Christopher Faulet (cherry picked from commit 97c8d5767f9f2e4b07f52bf2cbd3a3ba32d4e839) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 7e10287c3..94c91cd90 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -6211,13 +6211,6 @@ static int quic_rx_pkt_parse(struct quic_rx_packet *pkt, goto drop; } - if (pkt->type == QUIC_PACKET_TYPE_INITIAL && - dgram->len < QUIC_INITIAL_PACKET_MINLEN) { - TRACE_PROTO("Too short datagram with an Initial packet", QUIC_EV_CONN_LPKT); - HA_ATOMIC_INC(&prx_counters->too_short_initial_dgram); - goto drop; - } - /* When multiple QUIC packets are coalesced on the same UDP datagram, * they must have the same DCID. */ @@ -6312,6 +6305,19 @@ static int quic_rx_pkt_parse(struct quic_rx_packet *pkt, pkt->pn_offset = buf - beg; pkt->len = pkt->pn_offset + len; + /* RFC 9000. Initial Datagram Size + * + * A server MUST discard an Initial packet that is carried in a UDP datagram + * with a payload that is smaller than the smallest allowed maximum datagram + * size of 1200 bytes. 
+ */ + if (pkt->type == QUIC_PACKET_TYPE_INITIAL && + dgram->len < QUIC_INITIAL_PACKET_MINLEN) { + TRACE_PROTO("Too short datagram with an Initial packet", QUIC_EV_CONN_LPKT); + HA_ATOMIC_INC(&prx_counters->too_short_initial_dgram); + goto drop; + } + /* Interrupt parsing after packet length retrieval : this * ensures that only the packet is dropped but not the whole * datagram. From 99c9d5cb1e687beb6ef0b9dd09a95a52c671013c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 16 Feb 2023 17:30:53 +0100 Subject: [PATCH 097/140] BUG/MINOR: quic: Missing padding for short packets This was revealed by Amaury when setting tune.quic.frontend.max-streams-bidi to 8 and asking a client to open 12 streams. haproxy has to send short packets with little MAX_STREAMS frames encoded with 2 bytes. In addition to a packet number encoded with only one byte. In the case is the length of the encoded frames to be added to the packet plus the length of the packet number. Ensure the length of the packet is at least QUIC_PACKET_PN_MAXLEN adding a PADDING frame wich (QUIC_PACKET_PN_MAXLEN - ) as size. For instance with a two bytes MAX_STREAMS frames and a one byte packet number length, this adds one byte of padding. See https://datatracker.ietf.org/doc/html/rfc9001#name-header-protection-sample. Must be backported to 2.7 and 2.6. 
(cherry picked from commit 5faf577997314bcfbc25c8eef35b682058d2a999) Signed-off-by: Christopher Faulet (cherry picked from commit feb6c4e1e322bc2d51a010cd30dcdbb076f8fd9b) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 94c91cd90..d329a540c 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -7097,7 +7097,10 @@ static int qc_do_build_pkt(unsigned char *pos, const unsigned char *end, padding_len -= quic_int_getsize(len + padding_len) - len_sz; len += padding_len; } - else if (LIST_ISEMPTY(&frm_list) || len_frms == len) { + else if (len_frms && len_frms < QUIC_PACKET_PN_MAXLEN) { + len += padding_len = QUIC_PACKET_PN_MAXLEN - len_frms; + } + else if (LIST_ISEMPTY(&frm_list)) { if (qel->pktns->tx.pto_probe) { /* If we cannot send a frame, we send a PING frame. */ add_ping_frm = 1; From ec28b53567cc7e1325d4fe00d8b87ed66cbac732 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Tue, 7 Feb 2023 14:24:54 +0100 Subject: [PATCH 098/140] MINOR: quic: adjust request reject when MUX is already freed When the MUX is freed, the quic-conn layer may stay active until all streams acknowledgment are processed. In this interval, if a new stream is opened by the client, the quic-conn is thus now responsible to handle it. This is done by the emission of a STOP_SENDING. This process is closely related to HTTP/3 protocol despite being handled by the quic-conn layer. This highlights a flaw in our QUIC architecture which should be adjusted. To reflect this situation, the function qc_stop_sending_frm_enqueue() is renamed qc_h3_request_reject(). Also, internal H3 treatment such as uni-directional bypass has been moved inside the function. This commit is only a refactor. However, bug fix on next patches will rely on it so it should be backported up to 2.6. 
(cherry picked from commit 38836b6b3da227ee9be2f32632215578bcd61b55) Signed-off-by: Christopher Faulet (cherry picked from commit 7dff75afd0dbc72c5283cff7dcb7bed4e61c7943) [ad: adjusted context : replace non existent qc_frm_alloc by a simple pool_alloc()] Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index d329a540c..ed0f3077d 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2675,24 +2675,33 @@ static void qc_cc_err_count_inc(struct quic_conn *qc, struct quic_frame *frm) TRACE_LEAVE(QUIC_EV_CONN_CLOSE, qc); } -/* Enqueue a STOP_SENDING frame to send into 1RTT packet number space - * frame list to send. - * Return 1 if succeeded, 0 if not. +/* Cancel a request on connection for stream id . This is useful when + * the client opens a new stream but the MUX has already been released. A + * STOP_SENDING frame is prepared for emission. + * + * TODO this function is closely related to H3. Its place should be in H3 layer + * instead of quic-conn but this requires an architecture adjustment. + * + * Returns 1 on sucess else 0. */ -static int qc_stop_sending_frm_enqueue(struct quic_conn *qc, uint64_t id) +static int qc_h3_request_reject(struct quic_conn *qc, uint64_t id) { int ret = 0; struct quic_frame *frm; struct quic_enc_level *qel = &qc->els[QUIC_TLS_ENC_LEVEL_APP]; - uint64_t app_error_code; + const uint64_t app_error_code = H3_REQUEST_REJECTED; TRACE_ENTER(QUIC_EV_CONN_PRSHPKT, qc); - /* TODO: the mux may be released, we cannot have more - * information about the application error code to send - * at this time. + /* Do not emit rejection for unknown unidirectional stream as it is + * forbidden to close some of them (H3 control stream and QPACK + * encoder/decoder streams). 
*/ - app_error_code = H3_REQUEST_REJECTED; + if (quic_stream_is_uni(id)) { + ret = 1; + goto out; + } + // fixme: zalloc frm = pool_zalloc(pool_head_quic_frame); if (!frm) { @@ -2912,19 +2921,10 @@ static int qc_parse_pkt_frms(struct quic_conn *qc, struct quic_rx_packet *pkt, } else { TRACE_DEVEL("No mux for new stream", QUIC_EV_CONN_PRSHPKT, qc); - if (qc->app_ops == &h3_ops && quic_stream_is_uni(stream->id)) { - /* Do not send STOP_SENDING frames for h3 unidirectional streams. - * TODO: this test should be removed when the connection closure - * will be more clean. - * At quic_conn level there is no mean to know that an application - * want to forbid stream closure requests to receivers. This is the - * case for the Control and QPACK h3 unidirectional streams. - */ - goto leave; + if (qc->app_ops == &h3_ops) { + if (!qc_h3_request_reject(qc, stream->id)) + TRACE_ERROR("could not enqueue STOP_SENDING frame", QUIC_EV_CONN_PRSHPKT, qc); } - - if (!qc_stop_sending_frm_enqueue(qc, stream->id)) - TRACE_ERROR("could not enqueue STOP_SENDING frame", QUIC_EV_CONN_PRSHPKT, qc); /* This packet will not be acknowledged */ goto leave; } From 03414c0bd1418737aabb513029a9299d7d3d9d0c Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 20 Feb 2023 10:31:27 +0100 Subject: [PATCH 099/140] BUG/MINOR: quic: also send RESET_STREAM if MUX released When the MUX is freed, the quic-conn layer may stay active until all streams acknowledgment are processed. In this interval, if a new stream is opened by the client, the quic-conn is thus now responsible to handle it. This is done by the emission of a STOP_SENDING. This process has been completed to also emit a RESET_STREAM with the same error code H3_REQUEST_REJECTED. This is done to conform with the H3 specification to invite the client to retry its request on a new connection. This should be backported up to 2.6. 
(cherry picked from commit 75463017123ad32ff1afe4fe95dd9088b7e9cba4) Signed-off-by: Christopher Faulet (cherry picked from commit eb284ce40ca3425694ac5e4e1aae7b9df652f883) [ad: replace non-existant wrapper for quic_frame alloc/dealloc by plain pool_alloc/pool_free] Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index ed0f3077d..95455e41e 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2677,7 +2677,7 @@ static void qc_cc_err_count_inc(struct quic_conn *qc, struct quic_frame *frm) /* Cancel a request on connection for stream id . This is useful when * the client opens a new stream but the MUX has already been released. A - * STOP_SENDING frame is prepared for emission. + * STOP_SENDING + RESET_STREAM frames are prepared for emission. * * TODO this function is closely related to H3. Its place should be in H3 layer * instead of quic-conn but this requires an architecture adjustment. 
@@ -2687,7 +2687,7 @@ static void qc_cc_err_count_inc(struct quic_conn *qc, struct quic_frame *frm) static int qc_h3_request_reject(struct quic_conn *qc, uint64_t id) { int ret = 0; - struct quic_frame *frm; + struct quic_frame *ss, *rs; struct quic_enc_level *qel = &qc->els[QUIC_TLS_ENC_LEVEL_APP]; const uint64_t app_error_code = H3_REQUEST_REJECTED; @@ -2703,17 +2703,32 @@ static int qc_h3_request_reject(struct quic_conn *qc, uint64_t id) } // fixme: zalloc - frm = pool_zalloc(pool_head_quic_frame); - if (!frm) { + ss = pool_zalloc(pool_head_quic_frame); + if (!ss) { TRACE_ERROR("failed to allocate quic_frame", QUIC_EV_CONN_PRSHPKT, qc); goto out; } - frm->type = QUIC_FT_STOP_SENDING; - frm->stop_sending.id = id; - frm->stop_sending.app_error_code = app_error_code; - LIST_INIT(&frm->reflist); - LIST_APPEND(&qel->pktns->tx.frms, &frm->list); + ss->type = QUIC_FT_STOP_SENDING; + ss->stop_sending.id = id; + ss->stop_sending.app_error_code = app_error_code; + LIST_INIT(&ss->reflist); + + rs = pool_zalloc(pool_head_quic_frame); + if (!rs) { + TRACE_ERROR("failed to allocate quic_frame", QUIC_EV_CONN_PRSHPKT, qc); + pool_free(pool_head_quic_frame, &ss); + goto out; + } + + rs->type = QUIC_FT_RESET_STREAM; + rs->reset_stream.id = id; + rs->reset_stream.app_error_code = app_error_code; + rs->reset_stream.final_size = 0; + LIST_INIT(&rs->reflist); + + LIST_APPEND(&qel->pktns->tx.frms, &ss->list); + LIST_APPEND(&qel->pktns->tx.frms, &rs->list); ret = 1; out: TRACE_LEAVE(QUIC_EV_CONN_PRSHPKT, qc); From 364f7701514de529b7774b2da001b8d196f2c12a Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 20 Feb 2023 10:32:16 +0100 Subject: [PATCH 100/140] BUG/MINOR: quic: acknowledge STREAM frame even if MUX is released When the MUX is freed, the quic-conn layer may stay active until all streams acknowledgment are processed. In this interval, if a new stream is opened by the client, the quic-conn is thus now responsible to handle it. 
This is done by the emission of a STOP_SENDING + RESET_STREAM. Prior to this patch, the received packet was not acknowledged. This is undesirable if the quic-conn is able to properly reject the request as this can lead to unneeded retransmission from the client. This must be backported up to 2.6. (cherry picked from commit 156a89aef8c63910502b266251dc34f648a99fae) Signed-off-by: Christopher Faulet (cherry picked from commit 49c35005d7f488524b009aa8ade7e8946660bd0b) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 95455e41e..5391c10b4 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2937,11 +2937,16 @@ static int qc_parse_pkt_frms(struct quic_conn *qc, struct quic_rx_packet *pkt, else { TRACE_DEVEL("No mux for new stream", QUIC_EV_CONN_PRSHPKT, qc); if (qc->app_ops == &h3_ops) { - if (!qc_h3_request_reject(qc, stream->id)) - TRACE_ERROR("could not enqueue STOP_SENDING frame", QUIC_EV_CONN_PRSHPKT, qc); + if (!qc_h3_request_reject(qc, stream->id)) { + TRACE_ERROR("error on request rejection", QUIC_EV_CONN_PRSHPKT, qc); + /* This packet will not be acknowledged */ + goto leave; + } + } + else { + /* This packet will not be acknowledged */ + goto leave; } - /* This packet will not be acknowledged */ - goto leave; } } From 3d41e61a486efd34e42c66182d2c3c92d30b379e Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Thu, 26 Jan 2023 16:03:45 +0100 Subject: [PATCH 101/140] BUG/MINOR: h3: prevent hypothetical demux failure on int overflow h3s stores the current demux frame type and length as a state info. It should be big enough to store a QUIC variable-length integer which is the maximum H3 frame type and size. Without this patch, there is a risk of integer overflow if H3 frame size is bigger than INT_MAX. This can typically causes demux state mismatch and demux frame error. 
However, no occurence has been found yet of this bug with the current implementation. This should be backported up to 2.6. (cherry picked from commit 35d9053b6832c419f9a94ff331b5c495df1cde9d) Signed-off-by: Christopher Faulet (cherry picked from commit 1ef3723c229f9a18e7ba95e2000f477d12de7fc1) [ad: remove h3 trace facility not supported on 2.6] Signed-off-by: Amaury Denoyelle --- src/h3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/h3.c b/src/h3.c index 66fc8d548..300092304 100644 --- a/src/h3.c +++ b/src/h3.c @@ -147,8 +147,8 @@ struct h3s { enum h3s_t type; enum h3s_st_req st_req; /* only used for request streams */ - int demux_frame_len; - int demux_frame_type; + uint64_t demux_frame_len; + uint64_t demux_frame_type; unsigned long long body_len; /* known request body length from content-length header if present */ unsigned long long data_len; /* total length of all parsed DATA */ From 7006bd42d583f05237159151a1ebc1db7dec37a1 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Wed, 22 Feb 2023 10:44:27 +0100 Subject: [PATCH 102/140] BUG/MEDIUM: quic: properly handle duplicated STREAM frames When a STREAM frame is re-emitted, it will point to the same stream buffer as the original one. If an ACK is received for either one of these frame, the underlying buffer may be freed. Thus, if the second frame is declared as lost and schedule for retransmission, we must ensure that the underlying buffer is still allocated or interrupt the retransmission. Stream buffer is stored as an eb_tree indexed by the stream ID. To avoid to lookup over a tree each time a STREAM frame is re-emitted, a lost STREAM frame is flagged as QUIC_FL_TX_FRAME_LOST. In most cases, this code is functional. 
However, there is several potential issues which may cause a segfault : - when explicitely probing with a STREAM frame, the frame won't be flagged as lost - when splitting a STREAM frame during retransmission, the flag is not copied To fix both these cases, QUIC_FL_TX_FRAME_LOST flag has been converted to a field in quic_stream structure. This field is now properly copied when splitting a STREAM frame. Also, as this is now an inner quic_frame field, it will be copied automatically on qc_frm_dup() invocation thus ensuring that it will be set on probing. This issue was encounted randomly with the following backtrace : #0 __memmove_avx512_unaligned_erms () #1 0x000055f4d5a48c01 in memcpy (__len=18446698486215405173, __src=, #2 quic_build_stream_frame (buf=0x7f6ac3fcb400, end=, frm=0x7f6a00556620, #3 0x000055f4d5a4a147 in qc_build_frm (buf=buf@entry=0x7f6ac3fcb5d8, #4 0x000055f4d5a23300 in qc_do_build_pkt (pos=, end=, #5 0x000055f4d5a25976 in qc_build_pkt (pos=0x7f6ac3fcba10, #6 0x000055f4d5a30c7e in qc_prep_app_pkts (frms=0x7f6a0032bc50, buf=0x7f6a0032bf30, #7 qc_send_app_pkts (qc=0x7f6a0032b310, frms=0x7f6a0032bc50) at src/quic_conn.c:4184 #8 0x000055f4d5a35f42 in quic_conn_app_io_cb (t=0x7f6a0009c660, context=0x7f6a0032b310, This should fix github issue #2051. This should be backported up to 2.6. 
(cherry picked from commit c8a0efbda86a14af38084ce85933bb691563935c) Signed-off-by: William Lallemand (cherry picked from commit 85ab1edd1549c4eb4680543d7f86c3065fbaf30e) [ad: remove block which rejects frame on too many retransmission] Signed-off-by: Amaury Denoyelle --- include/haproxy/quic_frame-t.h | 4 ++-- src/quic_conn.c | 13 +++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/include/haproxy/quic_frame-t.h b/include/haproxy/quic_frame-t.h index 7770f6187..8cfceb582 100644 --- a/include/haproxy/quic_frame-t.h +++ b/include/haproxy/quic_frame-t.h @@ -99,8 +99,6 @@ enum quic_frame_type { /* Flag a TX frame as acknowledged */ #define QUIC_FL_TX_FRAME_ACKED 0x01 -/* Flag a TX frame as lost */ -#define QUIC_FL_TX_FRAME_LOST 0x02 #define QUIC_STREAM_FRAME_TYPE_FIN_BIT 0x01 #define QUIC_STREAM_FRAME_TYPE_LEN_BIT 0x02 @@ -176,6 +174,8 @@ struct quic_stream { * for RX pointer into the packet buffer. */ const unsigned char *data; + + char dup; /* set for duplicated frame : this forces to check for the underlying qc_stream_buf instance before emitting it. */ }; struct quic_max_data { diff --git a/src/quic_conn.c b/src/quic_conn.c index 5391c10b4..2c437ff19 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -1882,12 +1882,6 @@ static inline void qc_requeue_nacked_pkt_tx_frms(struct quic_conn *qc, pool_free(pool_head_quic_frame, frm); } else { - if (QUIC_FT_STREAM_8 <= frm->type && frm->type <= QUIC_FT_STREAM_F) { - /* Mark this STREAM frame as lost. A look up their stream descriptor - * will be performed to check the stream is not consumed or released. 
- */ - frm->flags |= QUIC_FL_TX_FRAME_LOST; - } LIST_APPEND(&tmp, &frm->list); TRACE_DEVEL("frame requeued", QUIC_EV_CONN_PRSAFRM, qc, frm); } @@ -2505,6 +2499,8 @@ static void qc_dup_pkt_frms(struct quic_conn *qc, TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); } + + strm_frm->dup = 1; break; } @@ -6747,7 +6743,7 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, break; case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: - if (cf->flags & QUIC_FL_TX_FRAME_LOST) { + if (cf->stream.dup) { struct eb64_node *node = NULL; struct qc_stream_desc *stream_desc = NULL; struct quic_stream *strm = &cf->stream; @@ -6859,7 +6855,8 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, /* FIN bit reset */ new_cf->type &= ~QUIC_STREAM_FRAME_TYPE_FIN_BIT; new_cf->stream.data = cf->stream.data; - TRACE_DEVEL("splitted frame", QUIC_EV_CONN_PRSAFRM, qc, new_cf); + new_cf->stream.dup = cf->stream.dup; + TRACE_DEVEL("split frame", QUIC_EV_CONN_PRSAFRM, qc, new_cf); if (cf->origin) { TRACE_DEVEL("duplicated frame", QUIC_EV_CONN_PRSAFRM, qc); /* This frame was duplicated */ From 51e31801a83c42bb081dbb2daded28292a1a05a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Mon, 20 Feb 2023 14:39:41 +0100 Subject: [PATCH 103/140] BUG/MINOR: quic: Do not send too small datagrams (with Initial packets) Before building a packet into a datagram, ensure there is sufficient space for at least 1200 bytes. Also pad datagrams with only one ack-eliciting Initial packet inside. Must be backported to 2.7 and 2.6. 
(cherry picked from commit 69e7118fe9acdad2163da0498a2173f623b74df2) Signed-off-by: William Lallemand (cherry picked from commit da21a8015f959ee1bf10c52c06f5ffe83bd8ff44) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/quic_conn.c b/src/quic_conn.c index 2c437ff19..fc254ceb7 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -3309,6 +3309,27 @@ static int qc_prep_pkts(struct quic_conn *qc, struct buffer *buf, } } + /* RFC 9000 14.1 Initial datagram size + * a server MUST expand the payload of all UDP datagrams carrying ack-eliciting + * Initial packets to at least the smallest allowed maximum datagram size of + * 1200 bytes. + * + * Ensure that no ack-eliciting packets are sent into too small datagrams + */ + if (pkt_type == QUIC_PACKET_TYPE_INITIAL && !LIST_ISEMPTY(tel_frms)) { + if (end - pos < QUIC_INITIAL_PACKET_MINLEN) { + TRACE_PROTO("No more enough room to build an Initial packets", + QUIC_EV_CONN_PHPKTS, qc); + goto out; + } + + /* Pad this Initial packet if there is no ack-eliciting frames to send from + * the next packet number space. + */ + if (LIST_ISEMPTY(next_tel_frms)) + padding = 1; + } + if (qc->negotiated_version) { ver = qc->negotiated_version; if (qel == &qc->els[QUIC_TLS_ENC_LEVEL_INITIAL]) From 7ee09e94458cd1460e6d9525603699193002f1b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Tue, 21 Feb 2023 16:44:05 +0100 Subject: [PATCH 104/140] BUG/MINOR: quic: Ensure to be able to build datagrams to be retransmitted When retransmitting datagrams with two coalesced packets inside, the second packet was not taken into consideration when checking there is enough space into the network for the datagram, especially when limited by the anti-amplification. Must be backported to 2.6 and 2.7. 
(cherry picked from commit d30a04a4bb4522d6ead55f2de788d9195d16df89) Signed-off-by: William Lallemand (cherry picked from commit ba731669e56d1c72de0bfe1d40de478f7605f6cd) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index fc254ceb7..0d4502aa5 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2550,7 +2550,7 @@ static void qc_prep_fast_retrans(struct quic_conn *qc, struct eb64_node *node; struct quic_tx_packet *pkt; - TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + TRACE_ENTER(QUIC_EV_CONN_SPPKTS, qc); BUG_ON(frms1 == frms2); @@ -2606,7 +2606,7 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, struct quic_tx_packet *pkt; struct list *tmp = &itmp; - TRACE_ENTER(QUIC_EV_CONN_PRSAFRM, qc); + TRACE_ENTER(QUIC_EV_CONN_SPPKTS, qc); start: pkt = NULL; pkts = &qel->pktns->tx.pkts; @@ -2625,10 +2625,16 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, /* When building a packet from another one, the field which may increase the * packet size is the packet number. And the maximum increase is 4 bytes. */ - if (!quic_peer_validated_addr(qc) && qc_is_listener(qc) && - pkt->len + 4 > 3 * qc->rx.bytes - qc->tx.prep_bytes) { - TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_PRSAFRM, qc); - goto end; + if (!quic_peer_validated_addr(qc) && qc_is_listener(qc)) { + size_t dglen = pkt->len + 4; + + dglen += pkt->next ? 
pkt->next->len + 4 : 0; + if (dglen > 3 * qc->rx.bytes - qc->tx.prep_bytes) { + TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt); + if (pkt->next) + TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt->next); + goto end; + } } qel->pktns->tx.pto_probe += 1; @@ -2642,7 +2648,7 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, pkt = pkt->next; tmp = &htmp; hqel->pktns->tx.pto_probe += 1; - TRACE_DEVEL("looping for next packet", QUIC_EV_CONN_PRSAFRM, qc); + TRACE_DEVEL("looping for next packet", QUIC_EV_CONN_SPPKTS, qc); goto requeue; } } @@ -2651,7 +2657,7 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, LIST_SPLICE(ifrms, &itmp); LIST_SPLICE(hfrms, &htmp); - TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); + TRACE_LEAVE(QUIC_EV_CONN_SPPKTS, qc); } static void qc_cc_err_count_inc(struct quic_conn *qc, struct quic_frame *frm) @@ -3318,7 +3324,7 @@ static int qc_prep_pkts(struct quic_conn *qc, struct buffer *buf, */ if (pkt_type == QUIC_PACKET_TYPE_INITIAL && !LIST_ISEMPTY(tel_frms)) { if (end - pos < QUIC_INITIAL_PACKET_MINLEN) { - TRACE_PROTO("No more enough room to build an Initial packets", + TRACE_PROTO("No more enough room to build an Initial packet", QUIC_EV_CONN_PHPKTS, qc); goto out; } From ee7bfd6040f2efaffeaf5f915cb30c7b7f7c1407 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Sat, 25 Feb 2023 11:27:34 +0100 Subject: [PATCH 105/140] BUG/MINOR: quic: Remove force_ack for Initial,Handshake packets This is an old bug which arrived in this commit due to a misinterpretation of the RFC I guess where the desired effect was to acknowledge all the handshake packets: 77ac6f566 BUG/MINOR: quic: Missing acknowledgments for trailing packets This had as bad effect to acknowledge all the handshake packets even the ones which are not ack-eliciting. Must be backported to 2.7 and 2.6. 
(cherry picked from commit b3562a38154f12da5d0015166bf965155c619bf7) Signed-off-by: William Lallemand (cherry picked from commit 4b9a3733ccc1f776b24127134c57e971e3af556d) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 0d4502aa5..5d3bfecfe 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -4057,7 +4057,7 @@ static inline int qc_treat_rx_crypto_frms(struct quic_conn *qc, * Return 1 if succeeded, 0 if not. */ int qc_treat_rx_pkts(struct quic_conn *qc, struct quic_enc_level *cur_el, - struct quic_enc_level *next_el, int force_ack) + struct quic_enc_level *next_el) { int ret = 0; struct eb64_node *node; @@ -4093,7 +4093,7 @@ int qc_treat_rx_pkts(struct quic_conn *qc, struct quic_enc_level *cur_el, else { struct quic_arng ar = { .first = pkt->pn, .last = pkt->pn }; - if (pkt->flags & QUIC_FL_RX_PACKET_ACK_ELICITING || force_ack) { + if (pkt->flags & QUIC_FL_RX_PACKET_ACK_ELICITING) { qel->pktns->flags |= QUIC_FL_PKTNS_ACK_REQUIRED; qel->pktns->rx.nb_aepkts_since_last_ack++; qc_idle_timer_rearm(qc, 1); @@ -4444,7 +4444,7 @@ struct task *quic_conn_app_io_cb(struct task *t, void *context, unsigned int sta if (!LIST_ISEMPTY(&qel->rx.pqpkts) && qc_qel_may_rm_hp(qc, qel)) qc_rm_hp_pkts(qc, qel); - if (!qc_treat_rx_pkts(qc, qel, NULL, 0)) { + if (!qc_treat_rx_pkts(qc, qel, NULL)) { TRACE_DEVEL("qc_treat_rx_pkts() failed", QUIC_EV_CONN_IO_CB, qc); goto out; } @@ -4490,7 +4490,7 @@ struct task *quic_conn_io_cb(struct task *t, void *context, unsigned int state) /* Early-data encryption level */ struct quic_enc_level *eqel; struct buffer *buf = NULL; - int st, force_ack, zero_rtt; + int st, zero_rtt; TRACE_ENTER(QUIC_EV_CONN_IO_CB, qc); eqel = &qc->els[QUIC_TLS_ENC_LEVEL_EARLY_DATA]; @@ -4541,9 +4541,7 @@ struct task *quic_conn_io_cb(struct task *t, void *context, unsigned int state) if (!LIST_ISEMPTY(&qel->rx.pqpkts) && qc_qel_may_rm_hp(qc, qel)) 
qc_rm_hp_pkts(qc, qel); - force_ack = qel == &qc->els[QUIC_TLS_ENC_LEVEL_INITIAL] || - qel == &qc->els[QUIC_TLS_ENC_LEVEL_HANDSHAKE]; - if (!qc_treat_rx_pkts(qc, qel, next_qel, force_ack)) + if (!qc_treat_rx_pkts(qc, qel, next_qel)) goto out; if (qc->flags & QUIC_FL_CONN_TO_KILL) { From 27bc982a4bd274ef6285e4ee7351e51f826de83b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 2 Mar 2023 11:53:43 +0100 Subject: [PATCH 106/140] BUG/MINOR: quic: Ensure not to retransmit packets with no ack-eliciting frames Even if there is a check in callers of qc_prep_hdshk_fast_retrans() and qc_prep_fast_retrans() to prevent retransmissions of packets with no ack-eliciting frames, these two functions should pay attention not do to that especially if someone decides to modify their implementations in the future. Must be backported to 2.6 and 2.7. (cherry picked from commit 21564be4a2ca209580bbe644b43e758f2536a0da) Signed-off-by: William Lallemand (cherry picked from commit 162fb779adec6e080d059923172b57a7b80ba494) [ad: adjust context due to missing traces] Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 5d3bfecfe..9bbe33858 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2558,11 +2558,15 @@ static void qc_prep_fast_retrans(struct quic_conn *qc, node = eb64_first(pkts); start: while (node) { - pkt = eb64_entry(node, struct quic_tx_packet, pn_node); + struct quic_tx_packet *p; + + p = eb64_entry(node, struct quic_tx_packet, pn_node); node = eb64_next(node); /* Skip the empty and coalesced packets */ - if (!LIST_ISEMPTY(&pkt->frms)) + if (!LIST_ISEMPTY(&p->frms)) { + pkt = p; break; + } } if (!pkt) @@ -2613,9 +2617,14 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, node = eb64_first(pkts); /* Skip the empty packet (they have already been retransmitted) */ while (node) { - pkt = eb64_entry(node, 
struct quic_tx_packet, pn_node); - if (!LIST_ISEMPTY(&pkt->frms) && !(pkt->flags & QUIC_FL_TX_PACKET_COALESCED)) + struct quic_tx_packet *p; + + p = eb64_entry(node, struct quic_tx_packet, pn_node); + if (!LIST_ISEMPTY(&p->frms) && !(p->flags & QUIC_FL_TX_PACKET_COALESCED)) { + pkt = p; break; + } + node = eb64_next(node); } From bbdbacc27a64d0eaa712c6be6bc0b3e559536eff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 2 Mar 2023 14:49:22 +0100 Subject: [PATCH 107/140] BUG/MINOR: quic: Do not resend already acked frames Some frames are marked as already acknowledged from duplicated packets whose the original packet has been acknowledged. There is no need to resend such packets or frames. Implement qc_pkt_with_only_acked_frms() to detect packet with only already acknowledged frames inside and use it from qc_prep_fast_retrans() which selects the packet to be retransmitted. Must be backported to 2.6 and 2.7. (cherry picked from commit e6359b649b7a172a4271ddef4daee2dfae7cbae1) Signed-off-by: William Lallemand (cherry picked from commit b21ed518d576e1ed64909739a3735b167c9d320b) [ad: adjust context due to missing traces] Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 9bbe33858..ddb6b42c3 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2474,6 +2474,11 @@ static void qc_dup_pkt_frms(struct quic_conn *qc, list_for_each_entry_safe(frm, frmbak, pkt_frm_list, list) { struct quic_frame *dup_frm, *origin; + if (frm->flags & QUIC_FL_TX_FRAME_ACKED) { + TRACE_DEVEL("already acknowledged frame", QUIC_EV_CONN_PRSAFRM, qc, frm); + continue; + } + switch (frm->type) { case QUIC_FT_STREAM_8 ... 
QUIC_FT_STREAM_F: { @@ -2540,6 +2545,20 @@ static void qc_dup_pkt_frms(struct quic_conn *qc, TRACE_LEAVE(QUIC_EV_CONN_PRSAFRM, qc); } +/* Boolean function which return 1 if TX packet is only made of + * already acknowledged frame. + */ +static inline int qc_pkt_with_only_acked_frms(struct quic_tx_packet *pkt) +{ + struct quic_frame *frm; + + list_for_each_entry(frm, &pkt->frms, list) + if (!(frm->flags & QUIC_FL_TX_FRAME_ACKED)) + return 0; + + return 1; +} + /* Prepare a fast retransmission from encryption level */ static void qc_prep_fast_retrans(struct quic_conn *qc, struct quic_enc_level *qel, @@ -2563,7 +2582,7 @@ static void qc_prep_fast_retrans(struct quic_conn *qc, p = eb64_entry(node, struct quic_tx_packet, pn_node); node = eb64_next(node); /* Skip the empty and coalesced packets */ - if (!LIST_ISEMPTY(&p->frms)) { + if (!LIST_ISEMPTY(&p->frms) && !qc_pkt_with_only_acked_frms(p)) { pkt = p; break; } @@ -2620,7 +2639,8 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, struct quic_tx_packet *p; p = eb64_entry(node, struct quic_tx_packet, pn_node); - if (!LIST_ISEMPTY(&p->frms) && !(p->flags & QUIC_FL_TX_PACKET_COALESCED)) { + if (!LIST_ISEMPTY(&p->frms) && !(p->flags & QUIC_FL_TX_PACKET_COALESCED) && + !qc_pkt_with_only_acked_frms(p)) { pkt = p; break; } From 5285f50a6368c1a1e0f5c2d178b8096dd5c9c55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Wed, 8 Feb 2023 16:08:28 +0100 Subject: [PATCH 108/140] MINOR: quic: Move code to wakeup the timer task to avoid anti-amplication deadlock This code was there because the timer task was not running on the same thread as the one which parse the QUIC packets. Now that this is no more the case, we can wake up this task directly. Must be backported to 2.7. 
(cherry picked from commit 75c8ad549002b01de0b0d139a26b356973af57a9) Signed-off-by: Christopher Faulet (cherry picked from commit 367c06096b2338c46bb9bec273c3bcb80d983ee7) [ad: taken on 2.6 to facilitate next cherry-pick] Signed-off-by: Amaury Denoyelle --- include/haproxy/quic_conn-t.h | 1 - src/quic_conn.c | 27 +++++++-------------------- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/include/haproxy/quic_conn-t.h b/include/haproxy/quic_conn-t.h index 7e8634893..6481dbdfa 100644 --- a/include/haproxy/quic_conn-t.h +++ b/include/haproxy/quic_conn-t.h @@ -604,7 +604,6 @@ enum qc_mux_state { /* Flags at connection level */ #define QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED (1U << 0) -#define QUIC_FL_CONN_IO_CB_WAKEUP (1U << 1) #define QUIC_FL_CONN_POST_HANDSHAKE_FRAMES_BUILT (1U << 2) #define QUIC_FL_CONN_LISTENER (1U << 3) #define QUIC_FL_CONN_ACCEPT_REGISTERED (1U << 4) diff --git a/src/quic_conn.c b/src/quic_conn.c index ddb6b42c3..927491ab0 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -4533,20 +4533,6 @@ struct task *quic_conn_io_cb(struct task *t, void *context, unsigned int state) qc_dgrams_retransmit(qc); } - if (qc->flags & QUIC_FL_CONN_IO_CB_WAKEUP) { - qc->flags &= ~QUIC_FL_CONN_IO_CB_WAKEUP; - TRACE_DEVEL("needs to wakeup the timer task after the anti-amplicaiton limit was reached", - QUIC_EV_CONN_IO_CB, qc); - /* The I/O handler has been woken up by the dgram parser (qc_lstnr_pkt_rcv()) - * after the anti-amplification was reached. - * - * TODO: this part should be removed. This was there because the - * datagram parser was not executed by only one thread. 
- */ - qc_set_timer(qc); - if (qc->timer_task && tick_isset(qc->timer) && tick_is_lt(qc->timer, now_ms)) - task_wakeup(qc->timer_task, TASK_WOKEN_MSG); - } ssl_err = SSL_ERROR_NONE; zero_rtt = st < QUIC_HS_ST_COMPLETE && quic_tls_has_rx_sec(eqel) && @@ -6494,7 +6480,6 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, * when sending the next packet if reached again. */ qc->flags &= ~QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; - qc->flags |= QUIC_FL_CONN_IO_CB_WAKEUP; io_cb_wakeup = 1; } @@ -6548,11 +6533,13 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, drop: HA_ATOMIC_INC(&qc->prx_counters->dropped_pkt); - /* Wakeup the I/O handler callback if the PTO timer must be armed. - * This cannot be done by this thread. - */ - if (io_cb_wakeup) - tasklet_wakeup(qc->wait_event.tasklet); + if (io_cb_wakeup) { + TRACE_DEVEL("needs to wakeup the timer task after the amplification limit was reached", + QUIC_EV_CONN_LPKT, qc); + qc_set_timer(qc); + if (qc->timer_task && tick_isset(qc->timer) && tick_is_lt(qc->timer, now_ms)) + task_wakeup(qc->timer_task, TASK_WOKEN_MSG); + } TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc ? qc : NULL, pkt, NULL, qv); } From 7cb8cfaaaaf143d38ca339c799605b7873da54aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Fri, 3 Mar 2023 10:16:32 +0100 Subject: [PATCH 109/140] BUG/MINOR: quic: Missing detections of amplification limit reached Mark the connection as limited by the anti-amplification limit when trying to probe the peer. Wakeup the connection PTO/dectection loss timer as soon as a datagram is received. This was done only when the datagram was dropped. This fixes deadlock issues revealed by some interop runner tests. Must be backported to 2.7 and 2.6. 
(cherry picked from commit a65b71f89f4dec6cd65aa71403349e25fed6bc51) Signed-off-by: William Lallemand (cherry picked from commit aa6b4c9325e04cdc7ca68f5b4b190c2d8ce59d72) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 927491ab0..989e899de 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2596,6 +2596,7 @@ static void qc_prep_fast_retrans(struct quic_conn *qc, */ if (!quic_peer_validated_addr(qc) && qc_is_listener(qc) && pkt->len + 4 > 3 * qc->rx.bytes - qc->tx.prep_bytes) { + qc->flags |= QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt); goto leave; } @@ -2659,6 +2660,7 @@ static void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, dglen += pkt->next ? pkt->next->len + 4 : 0; if (dglen > 3 * qc->rx.bytes - qc->tx.prep_bytes) { + qc->flags |= QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt); if (pkt->next) TRACE_PROTO("anti-amplification limit would be reached", QUIC_EV_CONN_SPPKTS, qc, pkt->next); @@ -6469,18 +6471,20 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, const struct quic_version *qv = pkt->version; struct quic_enc_level *qel = NULL; size_t b_cspace; - int io_cb_wakeup = 0; if (pkt->flags & QUIC_FL_RX_PACKET_DGRAM_FIRST && - !quic_peer_validated_addr(qc) && qc->flags & QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED) { TRACE_PROTO("PTO timer must be armed after anti-amplication was reached", QUIC_EV_CONN_LPKT, qc, NULL, NULL, qv); + TRACE_DEVEL("needs to wakeup the timer task after the amplification limit was reached", + QUIC_EV_CONN_LPKT, qc); /* Reset the anti-amplification bit. It will be set again * when sending the next packet if reached again. 
*/ qc->flags &= ~QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED; - io_cb_wakeup = 1; + qc_set_timer(qc); + if (qc->timer_task && tick_isset(qc->timer) && tick_is_lt(qc->timer, now_ms)) + task_wakeup(qc->timer_task, TASK_WOKEN_MSG); } if (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) { @@ -6533,14 +6537,6 @@ static void qc_rx_pkt_handle(struct quic_conn *qc, struct quic_rx_packet *pkt, drop: HA_ATOMIC_INC(&qc->prx_counters->dropped_pkt); - if (io_cb_wakeup) { - TRACE_DEVEL("needs to wakeup the timer task after the amplification limit was reached", - QUIC_EV_CONN_LPKT, qc); - qc_set_timer(qc); - if (qc->timer_task && tick_isset(qc->timer) && tick_is_lt(qc->timer, now_ms)) - task_wakeup(qc->timer_task, TASK_WOKEN_MSG); - } - TRACE_LEAVE(QUIC_EV_CONN_LPKT, qc ? qc : NULL, pkt, NULL, qv); } From 0768bcee234ce971d8b92733e34378c6b8083516 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Mon, 6 Mar 2023 09:10:53 +0100 Subject: [PATCH 110/140] BUG/MEDIUM: quic: do not crash when handling STREAM on released MUX The MUX instance is released before its quic-conn counterpart. On termination, a H3 GOAWAY is emitted to prevent the client to open new streams for this connection. The quic-conn instance will stay alive until all opened streams data are acknowledged. If the client tries to open a new stream during this interval despite the GOAWAY, quic-conn is responsible to request its immediate closure with a STOP_SENDING + RESET_STREAM. This behavior was already implemented but the received packet with the new STREAM was never acknowledged. This was fixed with the following commit : commit 156a89aef8c63910502b266251dc34f648a99fae BUG/MINOR: quic: acknowledge STREAM frame even if MUX is released However, this patch introduces a regression as it did not skip the call to qc_handle_strm_frm() despite the MUX instance being released. This can cause a segfault when using qcc_get_qcs() on a released MUX instance. 
To fix this, add a missing break statement which will skip qc_handle_strm_frm() when the MUX instance is not initialized. This commit was reproduced using a short timeout client and sending several requests with delay between them by using a modified aioquic. It produces a crash with the following backtrace : #0 0x000055555594d261 in __eb64_lookup (x=4, root=0x7ffff4091f60) at include/import/eb64tree.h:132 #1 eb64_lookup (root=0x7ffff4091f60, x=4) at src/eb64tree.c:37 #2 0x000055555563fc66 in qcc_get_qcs (qcc=0x7ffff4091dc0, id=4, receive_only=1, send_only=0, out=0x7ffff780ca70) at src/mux_quic.c:668 #3 0x0000555555641e1a in qcc_recv (qcc=0x7ffff4091dc0, id=4, len=40, offset=0, fin=1 '\001', data=0x7ffff40c4fef "\001&") at src/mux_quic.c:974 #4 0x0000555555619d28 in qc_handle_strm_frm (pkt=0x7ffff4088e60, strm_frm=0x7ffff780cf50, qc=0x7ffff7cef000, fin=1 '\001') at src/quic_conn.c:2515 #5 0x000055555561d677 in qc_parse_pkt_frms (qc=0x7ffff7cef000, pkt=0x7ffff4088e60, qel=0x7ffff7cef6c0) at src/quic_conn.c:3050 #6 0x00005555556230aa in qc_treat_rx_pkts (qc=0x7ffff7cef000, cur_el=0x7ffff7cef6c0, next_el=0x0) at src/quic_conn.c:4214 #7 0x0000555555625fee in quic_conn_app_io_cb (t=0x7ffff40c1fa0, context=0x7ffff7cef000, state=32848) at src/quic_conn.c:4640 #8 0x00005555558a676d in run_tasks_from_lists (budgets=0x7ffff780d470) at src/task.c:596 #9 0x00005555558a725b in process_runnable_tasks () at src/task.c:876 #10 0x00005555558522ba in run_poll_loop () at src/haproxy.c:2945 #11 0x00005555558529ac in run_thread_poll_loop (data=0x555555d14440 ) at src/haproxy.c:3141 #12 0x00007ffff789ebb5 in ?? () from /usr/lib/libc.so.6 #13 0x00007ffff7920d90 in ?? () from /usr/lib/libc.so.6 This should fix github issue #2067. This must be backported up to 2.6. 
(cherry picked from commit 315a4f6ae54da17fd28f7a14373b05bab0b5aa08) Signed-off-by: William Lallemand (cherry picked from commit 1103d1d979783a4f00d2fa7cac7b9d2abfaceed5) Signed-off-by: Amaury Denoyelle --- src/quic_conn.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/quic_conn.c b/src/quic_conn.c index 989e899de..256cd0168 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -2965,7 +2965,6 @@ static int qc_parse_pkt_frms(struct quic_conn *qc, struct quic_rx_packet *pkt, if (qc->mux_state != QC_MUX_READY) { if ((stream->id >> QCS_ID_TYPE_SHIFT) < nb_streams) { TRACE_DATA("Already closed stream", QUIC_EV_CONN_PRSHPKT, qc); - break; } else { TRACE_DEVEL("No mux for new stream", QUIC_EV_CONN_PRSHPKT, qc); @@ -2981,6 +2980,8 @@ static int qc_parse_pkt_frms(struct quic_conn *qc, struct quic_rx_packet *pkt, goto leave; } } + + break; } if (!qc_handle_strm_frm(pkt, stream, qc)) { From 18b2c3de379cd1a61785bbbad08450489ddcbb58 Mon Sep 17 00:00:00 2001 From: Amaury Denoyelle Date: Tue, 7 Mar 2023 18:07:08 +0100 Subject: [PATCH 111/140] BUG/MINOR: mux-quic: properly init STREAM frame as not duplicated STREAM frame retransmission has been recently fixed. A new boolean field was created for quic_stream frame type. It is set for duplicated STREAM frame to ensure extra checks on the underlying buffer are conducted before sending the frame. All of this has been implemented by this commit : 315a4f6ae54da17fd28f7a14373b05bab0b5aa08 BUG/MEDIUM: quic: do not crash when handling STREAM on released MUX However, the above commit is incomplete. In the MUX code, when a new STREAM frame is created, is left uninitialized. In most cases this is harmless as it will only add extra unneeded checks before sending the frame. So this is mainly a performance issue. There is however one case where this bug will lead to a crash : when the response consists only of an empty STREAM frame. 
In this case, the empty frame will be silently removed as it is incorrectly assimilated to an already acked frame range in qc_build_frms(). This can trigger a BUG_ON() in the MUX code as a qcs instance is still in the send list after qc_send_frames() invocation. Note that it is extremely rare to have only an empty STREAM frame. It was reproduced with HTTP/0.9 where no HTTP status line exists on an empty body. I do not know if this is possible on HTTP/3 as a status line should be present each time in a HEADERS frame. Properly initialize the dup field to 0 on each STREAM frame generated by the QUIC MUX to fix this issue. This crash may be linked to github issue #2049. This should be backported up to 2.6. (cherry picked from commit ebfafc212a88ce7fdc46a135c0d585d55f2c564d) Signed-off-by: Willy Tarreau (cherry picked from commit 301724cc4c4ad4593c18458c793cb6598b33b6d2) [ad: adjusted context] Signed-off-by: Amaury Denoyelle --- src/mux_quic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mux_quic.c b/src/mux_quic.c index ee035bbd6..49df0dc41 100644 --- a/src/mux_quic.c +++ b/src/mux_quic.c @@ -1301,6 +1301,7 @@ static int qcs_build_stream_frm(struct qcs *qcs, struct buffer *out, char fin, frm->stream.id = qcs->id; frm->stream.buf = out; frm->stream.data = (unsigned char *)b_peek(out, head); + frm->stream.dup = 0; /* FIN is positioned only when the buffer has been totally emptied. */ if (fin) From 8104f11eaf28b2a5bd4133c3d539f7113e00f5b0 Mon Sep 17 00:00:00 2001 From: William Lallemand Date: Thu, 9 Mar 2023 14:28:44 +0100 Subject: [PATCH 112/140] BUG/MINOR: mworker: use MASTER_MAXCONN as default maxconn value In environments where SYSTEM_MAXCONN is defined when compiling, the master will use this value instead of the original minimal value which was set to 100. When this happens, the master process could allocate RAM excessively since it does not need to have a high maxconn.
(For example if SYSTEM_MAXCONN was set to 100000 or more) This patch fixes the issue by using the new define MASTER_MAXCONN which defines a default maxconn of 100 for the master process. Must be backported as far as 2.5. (cherry picked from commit 2078d4b1f77f70ac110022c2b5ac4644e4c01640) Signed-off-by: Willy Tarreau (cherry picked from commit 585a6cb8ae836094df5ab4944607349866028ca1) Signed-off-by: Willy Tarreau --- include/haproxy/defaults.h | 10 ++++++++++ src/haproxy.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h index 80b2e6431..5fcd4ba98 100644 --- a/include/haproxy/defaults.h +++ b/include/haproxy/defaults.h @@ -269,6 +269,16 @@ #define DEFAULT_MAXCONN 100 #endif +/* Define a maxconn which will be used in the master process once it re-exec to + * the MODE_MWORKER_WAIT and won't change when SYSTEM_MAXCONN is set. + * + * 100 must be enough for the master since it only does communication between + * the master and the workers, and the master CLI. + */ +#ifndef MASTER_MAXCONN +#define MASTER_MAXCONN 100 +#endif + /* Minimum check interval for spread health checks. Servers with intervals * greater than or equal to this value will have their checks spread apart * and will be considered when searching the minimal interval.
diff --git a/src/haproxy.c b/src/haproxy.c index 4fcc74963..7f59af646 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -2291,7 +2291,7 @@ static void init(int argc, char **argv) /* set the default maxconn in the master, but let it be rewritable with -n */ if (global.mode & MODE_MWORKER_WAIT) - global.maxconn = DEFAULT_MAXCONN; + global.maxconn = MASTER_MAXCONN; if (cfg_maxconn > 0) global.maxconn = cfg_maxconn; From 2b6d33435e8b4007e9bb63f55732530ccee4ffcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=E9d=E9ric=20L=E9caille?= Date: Fri, 10 Mar 2023 13:34:30 +0100 Subject: [PATCH 113/140] BUG/MINOR: quic: Missing listener accept queue tasklet wakeups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This bug was revealed by h2load tests run as follows: h2load -t 4 --npn-list h3 -c 64 -m 16 -n 16384 -v https://127.0.0.1:4443/ This opens (-c) 64 QUIC connections and issues (-n) 16384 h3 requests from (-t) 4 threads, i.e. 256 requests by connection. Such tests could not always pass and often ended with such results displayed by h2load: finished in 53.74s, 38.11 req/s, 493.78KB/s requests: 16384 total, 2944 started, 2048 done, 2048 succeeded, 14336 failed, 14336 errored, 0 timeout status codes: 2048 2xx, 0 3xx, 0 4xx, 0 5xx traffic: 25.92MB (27174537) total, 102.00KB (104448) headers (space savings 1.92%), 25.80MB (27053569) data UDP datagram: 3883 sent, 24330 received min max mean sd ± sd time for request: 48.75ms 502.86ms 134.12ms 75.80ms 92.68% time for connect: 20.94ms 331.24ms 189.59ms 84.81ms 59.38% time to 1st byte: 394.36ms 417.01ms 406.72ms 9.14ms 75.00% req/s : 0.00 115.45 14.30 38.13 87.50% The number of successful requests was always a multiple of 256. Activating the traces also showed that some connections were blocked after having successfully completed their handshakes due to the fact that their mux was never started. The mux is started upon the acceptation of the connection. Under heavy load, some connections were never accepted.
From the moment where more than 4 (MAXACCEPT) connections were enqueued before a listener could be woken up to accept at most 4 connections, the remaining connections were not accepted, or only later at the second listener tasklet wakeup. Add a call to tasklet_wakeup() to the accept list tasklet of the listeners to wake it up if there are remaining connections to accept after having called listener_accept(). In this case the listener must not be removed from this accept list, otherwise at the next call it will not accept anything more. Must be backported to 2.7 and 2.6. (cherry picked from commit 4377dbd756efa1645106b0e53d3ddaba9a6f0702) Signed-off-by: Willy Tarreau (cherry picked from commit 9cc091cb29114d71a30e703aa895a6fa9658336b) Signed-off-by: Willy Tarreau --- src/quic_sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/quic_sock.c b/src/quic_sock.c index f096ca3e6..11008fb41 100644 --- a/src/quic_sock.c +++ b/src/quic_sock.c @@ -542,7 +542,10 @@ static struct task *quic_accept_run(struct task *t, void *ctx, unsigned int i) mt_list_for_each_entry_safe(lthr, &queue->listeners, quic_accept.list, elt1, elt2) { listener_accept(lthr->li); - MT_LIST_DELETE_SAFE(elt1); + if (!MT_LIST_ISEMPTY(&lthr->quic_accept.conns)) + tasklet_wakeup((struct tasklet*)t); + else + MT_LIST_DELETE_SAFE(elt1); } return NULL; From fe7d0a810b76429ae9ed449fba7c72529c2e87e0 Mon Sep 17 00:00:00 2001 From: Michael Prokop Date: Fri, 9 Dec 2022 12:28:46 +0100 Subject: [PATCH 114/140] DOC/CLEANUP: fix typos s/algorithmm/algorithm/ s/an other/another/ s/certicates/certificates/ s/exemples/examples/ s/informations/information/ s/optionnal/optional/ (cherry picked from commit 9a62e35e371f0f39737a1ea2fe47f2a7e24824b1) Signed-off-by: Willy Tarreau (cherry picked from commit 4fc20a88fa6451926f5e798d3931b3b4549debdd) [wt: dropped changes to missing parts] Signed-off-by: Willy Tarreau --- doc/design-thoughts/config-language.txt | 4 ++-- doc/internals/http-parsing.txt | 4 ++--
doc/management.txt | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/design-thoughts/config-language.txt b/doc/design-thoughts/config-language.txt index 510ada68e..20c4fbd2b 100644 --- a/doc/design-thoughts/config-language.txt +++ b/doc/design-thoughts/config-language.txt @@ -24,9 +24,9 @@ Pour les filtres : = [ == | =~ | =* | =^ | =/ | != | !~ | !* | !^ | !/ ] = "" = [ allow | permit | deny | delete | replace | switch | add | set | redir ] - = optionnal action args + = optional action args - exemples: + examples: req in URI =^ "/images" switch images req in h(host) =* ".mydomain.com" switch mydomain diff --git a/doc/internals/http-parsing.txt b/doc/internals/http-parsing.txt index 494558baa..8b3f23960 100644 --- a/doc/internals/http-parsing.txt +++ b/doc/internals/http-parsing.txt @@ -325,11 +325,11 @@ Unfortunately, some products such as Apache allow such characters :-/ - each http_txn has 1 request message (http_req), and 0 or 1 response message (http_rtr). Each of them has 1 and only one http_txn. An http_txn holds - informations such as the HTTP method, the URI, the HTTP version, the + information such as the HTTP method, the URI, the HTTP version, the transfer-encoding, the HTTP status, the authorization, the req and rtr content-length, the timers, logs, etc... The backend and server which process the request are also known from the http_txn. -- both request and response messages hold header and parsing informations, such +- both request and response messages hold header and parsing information, such as the parsing state, start of headers, start of message, captures, etc... 
diff --git a/doc/management.txt b/doc/management.txt index a6afa9d58..dfbcca239 100644 --- a/doc/management.txt +++ b/doc/management.txt @@ -2879,7 +2879,7 @@ show resolvers [] other: any other DNS errors invalid: invalid DNS response (from a protocol point of view) too_big: too big response - outdated: number of response arrived too late (after an other name server) + outdated: number of response arrived too late (after another name server) show servers conn [] Dump the current and idle connections state of the servers belonging to the From b08ef7cf633a554b136aa0fb7a7d654c41dbe454 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 10 Mar 2023 16:29:22 +0100 Subject: [PATCH 115/140] [RELEASE] Released version 2.6.10 Released version 2.6.10 with the following main changes : - BUG/MINOR: mworker: stop doing strtok directly from the env - BUG/MEDIUM: mworker: prevent inconsistent reload when upgrading from old versions - BUG/MEDIUM: mworker: don't register mworker_accept_wrapper() when master FD is wrong - MINOR: startup: HAPROXY_STARTUP_VERSION contains the version used to start - BUG/MINOR: lua/httpclient: missing free in hlua_httpclient_send() - BUG/MEDIUM: httpclient/lua: fix a race between lua GC and hlua_ctx_destroy - MINOR: fd/cli: report the polling mask in "show fd" - BUG/MEDIUM: stconn: Don't rearm the read expiration date if EOI was reached - BUG/MINOR: sched: properly report long_rq when tasks remain in the queue - BUG/MEDIUM: sched: allow a bit more TASK_HEAVY to be processed when needed - BUG/MINOR: mworker: prevent incorrect values in uptime - MINOR: mux-h2/traces: do not log h2s pointer for dummy streams - MINOR: mux-h2/traces: add a missing TRACE_LEAVE() in h2s_frt_handle_headers() - REGTESTS: Fix ssl_errors.vtc script to wait for connections close - BUG/MINOR: cache: Cache response even if request has "no-cache" directive - BUG/MINOR: cache: Check cache entry is complete in case of Vary - BUG/MINOR: ring: do not realign ring contents on resize - 
BUILD: thead: Fix several 32 bits compilation issues with uint64_t variables - BUG/MEDIUM: h1-htx: Never copy more than the max data allowed during parsing - DOC: config: Fix description of options about HTTP connection modes - DOC: config: Add the missing tune.fail-alloc option from global listing - DOC: config: Clarify the meaning of 'hold' in the 'resolvers' section - BUG/MEDIUM: connection: Clear flags when a conn is removed from an idle list - BUG/MINOR: http-check: Don't set HTX_SL_F_BODYLESS flag with a log-format body - BUG/MINOR: http-check: Skip C-L header for empty body when it's not mandatory - BUG/MINOR: http-ana: Don't increment conn_retries counter before the L7 retry - BUG/MINOR: http-ana: Do a L7 retry on read error if there is no response - BUG/MINOR: ssl: Use 'date' instead of 'now' in ocsp stapling callback - MINOR: ssl: rename confusing ssl_bind_kws - BUG/MINOR: config: crt-list keywords mistaken for bind ssl keywords - BUG/MINOR: init: properly detect NUMA bindings on large systems - BUG/MEDIUM: master: force the thread count earlier - BUG/MINOR: init: make sure to always limit the total number of threads - BUG/MINOR: thread: report thread and group counts in the correct order - BUG/MINOR: ring: release the backing store name on exit - MEDIUM: epoll: don't synchronously delete migrated FDs - MEDIUM: poller: program the update in fd_update_events() for a migrated FD - MAJOR: fd: remove pending updates upon real close - MINOR: fd: delete unused updates on close() - MEDIUM: fd: add the tgid to the fd and pass it to fd_insert() - MINOR: cli/fd: show fd's tgid and refcount in "show fd" - MINOR: fd: add functions to manipulate the FD's tgid - MINOR: fd: add fd_get_running() to atomically return the running mask - MAJOR: fd: grab the tgid before manipulating running - MINOR: fd: make fd_clr_running() return the previous value instead - MEDIUM: fd: make fd_insert/fd_delete atomically update fd.tgid - BUG/MINOR: fd: Properly init the fd state in 
fd_insert() - MEDIUM: fd: quit fd_update_events() when FD is closed - MAJOR: poller: only touch/inspect the update_mask under tgid protection - MEDIUM: fd: support broadcasting updates for foreign groups in updt_fd_polling - BUG/MAJOR: fd/thread: fix race between updates and closing FD - BUG/MAJOR: fd/threads: close a race on closing connections after takeover - MINOR: h3/hq-interop: handle no data in decode_qcs() with FIN set - BUG/MINOR: mux-quic: transfer FIN on empty STREAM frame - BUG/MINOR: quic: Possible unexpected counter incrementation on send*() errors - BUG/MINOR: quic: Really cancel the connection timer from qc_set_timer() - BUG/MINOR: quic: Missing call to task_queue() in qc_idle_timer_do_rearm() - BUG/MINOR: quic: Do not probe with too little Initial packets - BUG/MINOR: quic: Wrong initialization for io_cb_wakeup boolean - BUG/MINOR: quic: Do not drop too small datagrams with Initial packets - BUG/MINOR: quic: Missing padding for short packets - MINOR: quic: adjust request reject when MUX is already freed - BUG/MINOR: quic: also send RESET_STREAM if MUX released - BUG/MINOR: quic: acknowledge STREAM frame even if MUX is released - BUG/MINOR: h3: prevent hypothetical demux failure on int overflow - BUG/MEDIUM: quic: properly handle duplicated STREAM frames - BUG/MINOR: quic: Do not send too small datagrams (with Initial packets) - BUG/MINOR: quic: Ensure to be able to build datagrams to be retransmitted - BUG/MINOR: quic: Remove force_ack for Initial,Handshake packets - BUG/MINOR: quic: Ensure not to retransmit packets with no ack-eliciting frames - BUG/MINOR: quic: Do not resend already acked frames - MINOR: quic: Move code to wakeup the timer task to avoid anti-amplication deadlock - BUG/MINOR: quic: Missing detections of amplification limit reached - BUG/MEDIUM: quic: do not crash when handling STREAM on released MUX - BUG/MINOR: mux-quic: properly init STREAM frame as not duplicated - BUG/MINOR: mworker: use MASTER_MAXCONN as default maxconn value 
- BUG/MINOR: quic: Missing listener accept queue tasklet wakeups - DOC/CLEANUP: fix typos --- CHANGELOG | 80 +++++++++++++++++++++++++++++++++++++++++++ VERDATE | 2 +- VERSION | 2 +- doc/configuration.txt | 2 +- 4 files changed, 83 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 362b9fcfd..e89a3faa7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,86 @@ ChangeLog : =========== +2023/03/10 : 2.6.10 + - BUG/MINOR: mworker: stop doing strtok directly from the env + - BUG/MEDIUM: mworker: prevent inconsistent reload when upgrading from old versions + - BUG/MEDIUM: mworker: don't register mworker_accept_wrapper() when master FD is wrong + - MINOR: startup: HAPROXY_STARTUP_VERSION contains the version used to start + - BUG/MINOR: lua/httpclient: missing free in hlua_httpclient_send() + - BUG/MEDIUM: httpclient/lua: fix a race between lua GC and hlua_ctx_destroy + - MINOR: fd/cli: report the polling mask in "show fd" + - BUG/MEDIUM: stconn: Don't rearm the read expiration date if EOI was reached + - BUG/MINOR: sched: properly report long_rq when tasks remain in the queue + - BUG/MEDIUM: sched: allow a bit more TASK_HEAVY to be processed when needed + - BUG/MINOR: mworker: prevent incorrect values in uptime + - MINOR: mux-h2/traces: do not log h2s pointer for dummy streams + - MINOR: mux-h2/traces: add a missing TRACE_LEAVE() in h2s_frt_handle_headers() + - REGTESTS: Fix ssl_errors.vtc script to wait for connections close + - BUG/MINOR: cache: Cache response even if request has "no-cache" directive + - BUG/MINOR: cache: Check cache entry is complete in case of Vary + - BUG/MINOR: ring: do not realign ring contents on resize + - BUILD: thead: Fix several 32 bits compilation issues with uint64_t variables + - BUG/MEDIUM: h1-htx: Never copy more than the max data allowed during parsing + - DOC: config: Fix description of options about HTTP connection modes + - DOC: config: Add the missing tune.fail-alloc option from global listing + - DOC: config: 
Clarify the meaning of 'hold' in the 'resolvers' section + - BUG/MEDIUM: connection: Clear flags when a conn is removed from an idle list + - BUG/MINOR: http-check: Don't set HTX_SL_F_BODYLESS flag with a log-format body + - BUG/MINOR: http-check: Skip C-L header for empty body when it's not mandatory + - BUG/MINOR: http-ana: Don't increment conn_retries counter before the L7 retry + - BUG/MINOR: http-ana: Do a L7 retry on read error if there is no response + - BUG/MINOR: ssl: Use 'date' instead of 'now' in ocsp stapling callback + - MINOR: ssl: rename confusing ssl_bind_kws + - BUG/MINOR: config: crt-list keywords mistaken for bind ssl keywords + - BUG/MINOR: init: properly detect NUMA bindings on large systems + - BUG/MEDIUM: master: force the thread count earlier + - BUG/MINOR: init: make sure to always limit the total number of threads + - BUG/MINOR: thread: report thread and group counts in the correct order + - BUG/MINOR: ring: release the backing store name on exit + - MEDIUM: epoll: don't synchronously delete migrated FDs + - MEDIUM: poller: program the update in fd_update_events() for a migrated FD + - MAJOR: fd: remove pending updates upon real close + - MINOR: fd: delete unused updates on close() + - MEDIUM: fd: add the tgid to the fd and pass it to fd_insert() + - MINOR: cli/fd: show fd's tgid and refcount in "show fd" + - MINOR: fd: add functions to manipulate the FD's tgid + - MINOR: fd: add fd_get_running() to atomically return the running mask + - MAJOR: fd: grab the tgid before manipulating running + - MINOR: fd: make fd_clr_running() return the previous value instead + - MEDIUM: fd: make fd_insert/fd_delete atomically update fd.tgid + - BUG/MINOR: fd: Properly init the fd state in fd_insert() + - MEDIUM: fd: quit fd_update_events() when FD is closed + - MAJOR: poller: only touch/inspect the update_mask under tgid protection + - MEDIUM: fd: support broadcasting updates for foreign groups in updt_fd_polling + - BUG/MAJOR: fd/thread: fix race between 
updates and closing FD + - BUG/MAJOR: fd/threads: close a race on closing connections after takeover + - MINOR: h3/hq-interop: handle no data in decode_qcs() with FIN set + - BUG/MINOR: mux-quic: transfer FIN on empty STREAM frame + - BUG/MINOR: quic: Possible unexpected counter incrementation on send*() errors + - BUG/MINOR: quic: Really cancel the connection timer from qc_set_timer() + - BUG/MINOR: quic: Missing call to task_queue() in qc_idle_timer_do_rearm() + - BUG/MINOR: quic: Do not probe with too little Initial packets + - BUG/MINOR: quic: Wrong initialization for io_cb_wakeup boolean + - BUG/MINOR: quic: Do not drop too small datagrams with Initial packets + - BUG/MINOR: quic: Missing padding for short packets + - MINOR: quic: adjust request reject when MUX is already freed + - BUG/MINOR: quic: also send RESET_STREAM if MUX released + - BUG/MINOR: quic: acknowledge STREAM frame even if MUX is released + - BUG/MINOR: h3: prevent hypothetical demux failure on int overflow + - BUG/MEDIUM: quic: properly handle duplicated STREAM frames + - BUG/MINOR: quic: Do not send too small datagrams (with Initial packets) + - BUG/MINOR: quic: Ensure to be able to build datagrams to be retransmitted + - BUG/MINOR: quic: Remove force_ack for Initial,Handshake packets + - BUG/MINOR: quic: Ensure not to retransmit packets with no ack-eliciting frames + - BUG/MINOR: quic: Do not resend already acked frames + - MINOR: quic: Move code to wakeup the timer task to avoid anti-amplication deadlock + - BUG/MINOR: quic: Missing detections of amplification limit reached + - BUG/MEDIUM: quic: do not crash when handling STREAM on released MUX + - BUG/MINOR: mux-quic: properly init STREAM frame as not duplicated + - BUG/MINOR: mworker: use MASTER_MAXCONN as default maxconn value + - BUG/MINOR: quic: Missing listener accept queue tasklet wakeups + - DOC/CLEANUP: fix typos + 2023/02/14 : 2.6.9 - BUG/MINOR: sink: make sure to always properly unmap a file-backed ring - DEV: haring: add a new 
option "-r" to automatically repair broken files diff --git a/VERDATE b/VERDATE index b0f52f409..941e9fc00 100644 --- a/VERDATE +++ b/VERDATE @@ -1,2 +1,2 @@ $Format:%ci$ -2023/02/14 +2023/03/10 diff --git a/VERSION b/VERSION index d48d3702a..a04abec91 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.9 +2.6.10 diff --git a/doc/configuration.txt b/doc/configuration.txt index 9b1cf3980..9e0b1dbb4 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3,7 +3,7 @@ Configuration Manual ---------------------- version 2.6 - 2023/02/14 + 2023/03/10 This document covers the configuration language as implemented in the version From 883f35da4bff499b7ffe9c6b74d855c7ae6bc179 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 14 Mar 2023 14:33:11 +0100 Subject: [PATCH 116/140] BUG/MEDIUM: proxy: properly stop backends on soft-stop On soft-stop, we must properlu stop backends and not only proxies with at least a listener. This is mandatory in order to stop the health checks. A previous fix was provided to do so (ba29687bc1 "BUG/MEDIUM: proxy: properly stop backends"). However, only stop_proxy() function was fixed. When HAproxy is stopped, this function is no longer used. So the same kind of fix must be done on do_soft_stop_now(). This patch partially fixes the issue #1874. It must be backported as far as 2.4. (cherry picked from commit 48678e483f4f47e2a212a545399009b87503ea2d) Signed-off-by: Christopher Faulet (cherry picked from commit 6faf82b613a706f9751356ee933829ac8f4e8d18) Signed-off-by: Christopher Faulet --- src/proxy.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/proxy.c b/src/proxy.c index dbd122a80..3937809f7 100644 --- a/src/proxy.c +++ b/src/proxy.c @@ -2200,6 +2200,7 @@ struct task *hard_stop(struct task *t, void *context, unsigned int state) /* perform the soft-stop right now (i.e. 
unbind listeners) */ static void do_soft_stop_now() { + struct proxy *p; struct task *task; /* disable busy polling to avoid cpu eating for the new process */ @@ -2224,6 +2225,15 @@ static void do_soft_stop_now() /* stop all stoppable listeners */ protocol_stop_now(); + /* Loop on proxies to stop backends */ + p = proxies_list; + while (p) { + HA_RWLOCK_WRLOCK(PROXY_LOCK, &p->lock); + proxy_cond_disable(p); + HA_RWLOCK_WRUNLOCK(PROXY_LOCK, &p->lock); + p = p->next; + } + /* signal zero is used to broadcast the "stopping" event */ signal_handler(0); } From b3c4d38aa806d179954003e73a21150214fbae55 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 14 Mar 2023 14:41:55 +0100 Subject: [PATCH 117/140] BUG/MEDIUM: resolvers: Properly stop server resolutions on soft-stop When HAProxy is stopping, the DNS resolutions must be stopped, except those triggered from a "do-resolve" action. To do so, the resolutions themselves cannot be destroyed, the current design is too complex. However, it is possible to mute the resolvers tasks. The same is already performed with the health-checks. On soft-stop, the tasks are still running periodically but nothing is performed. For the resolvers, when the process is stopping, before running a resolution, we check all the requesters attached to this resolution. If at least one requester is a stream or if there is a requester attached to a running proxy, a new resolution is triggered. Otherwise, we ignore the resolution. It will be evaluated again on the next wakeup. This way, "do-resolve" actions are still working during soft-stop but other resolutions are stopped. Of course, it may be seen as a feature and not a bug because it was never performed. But it is in fact not expected at all to still perform resolutions when HAProxy is stopping. In addition, a proxy option will be added to change this behavior. This patch partially fixes the issue #1874. It could be backported to 2.7 and maybe to 2.6. But no further.
(cherry picked from commit 52ec6f14c4cbfe23cbe3bf6ef55af84ba17047de) Signed-off-by: Christopher Faulet (cherry picked from commit 058782b3f28d4ea6543f24d4996f97410f4e9f49) Signed-off-by: Christopher Faulet --- src/resolvers.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/resolvers.c b/src/resolvers.c index be22d551d..a814c2404 100644 --- a/src/resolvers.c +++ b/src/resolvers.c @@ -2394,6 +2394,48 @@ static struct task *process_resolvers(struct task *t, void *context, unsigned in /* Handle all resolutions in the wait list */ list_for_each_entry_safe(res, resback, &resolvers->resolutions.wait, list) { + + if (unlikely(stopping)) { + /* If haproxy is stopping, check if the resolution to know if it must be run or not. + * If at least a requester is a stream (because of a do-resolv action) or if there + * is a requester attached to a running proxy, the resolution is performed. + * Otherwise, it is skipped for now. + */ + struct resolv_requester *req; + int must_run = 0; + + list_for_each_entry(req, &res->requesters, list) { + struct proxy *px = NULL; + + switch (obj_type(req->owner)) { + case OBJ_TYPE_SERVER: + px = __objt_server(req->owner)->proxy; + break; + case OBJ_TYPE_SRVRQ: + px = __objt_resolv_srvrq(req->owner)->proxy; + break; + case OBJ_TYPE_STREAM: + /* Always perform the resolution */ + must_run = 1; + break; + default: + break; + } + /* Perform the resolution if the proxy is not stopped or disabled */ + if (px && !(px->flags & (PR_FL_DISABLED|PR_FL_STOPPED))) + must_run = 1; + + if (must_run) + break; + } + + if (!must_run) { + /* Skip the reolsution. 
reset it and wait for the next wakeup */ + resolv_reset_resolution(res); + continue; + } + } + if (LIST_ISEMPTY(&res->requesters)) { abort_resolution(res); continue; From 716ac838c451d85f0f42455bb1d6eac69a1db099 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 14 Mar 2023 15:48:06 +0100 Subject: [PATCH 118/140] DEBUG: cli/show_fd: Display connection error code When FD are dumps, the connection error code is now displayed. This may help to diagnose why a connection error occurred. This patch may be backported to help debugging. (cherry picked from commit d52f2ad6ee519ad272c6084089d8d853cefed788) Signed-off-by: Christopher Faulet (cherry picked from commit 3829656c7e95297755d30cf5dee24f4c270a9696) Signed-off-by: Christopher Faulet --- src/cli.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cli.c b/src/cli.c index 236311ad0..1dc37f68f 100644 --- a/src/cli.c +++ b/src/cli.c @@ -1296,6 +1296,7 @@ static int cli_io_handler_show_fd(struct appctx *appctx) const void *ctx = NULL; const void *xprt_ctx = NULL; uint32_t conn_flags = 0; + uint8_t conn_err = 0; int is_back = 0; int suspicious = 0; @@ -1313,6 +1314,7 @@ static int cli_io_handler_show_fd(struct appctx *appctx) else if (fdt.iocb == sock_conn_iocb) { conn = (const struct connection *)fdt.owner; conn_flags = conn->flags; + conn_err = conn->err_code; mux = conn->mux; ctx = conn->ctx; xprt = conn->xprt; @@ -1362,7 +1364,7 @@ static int cli_io_handler_show_fd(struct appctx *appctx) chunk_appendf(&trash, ")"); } else if (fdt.iocb == sock_conn_iocb) { - chunk_appendf(&trash, ") back=%d cflg=0x%08x", is_back, conn_flags); + chunk_appendf(&trash, ") back=%d cflg=0x%08x cerr=%d", is_back, conn_flags, conn_err); if (conn->handle.fd != fd) { chunk_appendf(&trash, " fd=%d(BOGUS)", conn->handle.fd); From 57bb44b9325b23bbeb92844d3ec244d26eb2de53 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Tue, 14 Mar 2023 15:51:33 +0100 Subject: [PATCH 119/140] DEBUG: ssl-sock/show_fd: Display 
SSL error code Like for connection error code, when FD are dumps, the ssl error code is now displayed. This may help to diagnose why a connection error occurred. This patch may be backported to help debugging. (cherry picked from commit f19c639787af445b92961356827f5db200215d97) Signed-off-by: Christopher Faulet (cherry picked from commit 92c1452111312e6af0db56e65051cea621a4d407) Signed-off-by: Christopher Faulet --- src/ssl_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ssl_sock.c b/src/ssl_sock.c index 2c47fcb00..919a08a88 100644 --- a/src/ssl_sock.c +++ b/src/ssl_sock.c @@ -7282,7 +7282,7 @@ static int ssl_sock_show_fd(struct buffer *buf, const struct connection *conn, c chunk_appendf(&trash, " xctx.conn=%p(BOGUS)", sctx->conn); ret = 1; } - chunk_appendf(&trash, " xctx.st=%d", sctx->xprt_st); + chunk_appendf(&trash, " xctx.st=%d .err=%ld", sctx->xprt_st, sctx->error_code); if (sctx->xprt) { chunk_appendf(&trash, " .xprt=%s", sctx->xprt->name); From d11759bd26dc6a4c1dafba6256fab48c81d1936d Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Fri, 10 Mar 2023 16:53:43 +0100 Subject: [PATCH 120/140] BUG/MINOR: tcp_sample: fix a bug in fc_dst_port and fc_dst_is_local sample fetches There is a bug in the smp_fetch_dport() function which affects the 'f' case, also known as 'fc_dst_port' sample fetch. conn_get_src() is used to retrieve the address prior to calling conn_dst(). But this is wrong: conn_get_dst() should be used instead. Because of that, conn_dst() may return unexpected results since the dst address is not guaranteed to be set depending on the conn state at the time the sample fetch is used. This was reported by Corin Langosch on the ML: during his tests he noticed that using fc_dst_port in a log-format string resulted in the correct value being printed in the logs but when he used it in an ACL, the ACL did not evaluate properly. 
This can be easily reproduced with the following test conf: |frontend test-http | bind 127.0.0.1:8080 | mode http | | acl test fc_dst_port eq 8080 | http-request return status 200 if test | http-request return status 500 if !test A request on 127.0.0.1:8080 should normally return 200 OK, but here it will return a 500. The same bug was also found in smp_fetch_dst_is_local() (fc_dst_is_local sample fetch) by reading the code: the fix was applied twice. This needs to be backported up to 2.5 [both sample fetches were introduced in 2.5 with 888cd70 ("MINOR: tcp-sample: Add samples to get original info about client connection")] (cherry picked from commit 819817fc5e3cc587fbd08d3f3bbf8d539ef1a633) Signed-off-by: Christopher Faulet (cherry picked from commit e01bee20926bd88d3c3a2ef3462eb44a01a10b7f) Signed-off-by: Christopher Faulet --- src/tcp_sample.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tcp_sample.c b/src/tcp_sample.c index 925b93291..12eb25c4e 100644 --- a/src/tcp_sample.c +++ b/src/tcp_sample.c @@ -176,7 +176,7 @@ int smp_fetch_dst_is_local(const struct arg *args, struct sample *smp, const cha if (kw[0] == 'f') { /* fc_dst_is_local */ struct connection *conn = objt_conn(smp->sess->origin); - if (conn && conn_get_src(conn)) + if (conn && conn_get_dst(conn)) dst = conn_dst(conn); } else /* dst_is_local */ @@ -232,10 +232,10 @@ smp_fetch_dport(const struct arg *args, struct sample *smp, const char *kw, void if (conn && conn_get_dst(conn)) dst = conn_dst(conn); } - else if (kw[0] == 'f') { /* fc_dst_post */ + else if (kw[0] == 'f') { /* fc_dst_port */ struct connection *conn = objt_conn(smp->sess->origin); - if (conn && conn_get_src(conn)) + if (conn && conn_get_dst(conn)) dst = conn_dst(conn); } else /* dst_port */ From a1327f36b0cddb39b330342684a838518c1163a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Thu, 16 Mar 2023 12:30:36 +0100 Subject: [PATCH 121/140] BUG/MINOR: quic: Missing STREAM 
frame length updates Some STREAM frame lengths were not updated before being duplicated, built of requeued contrary to their ack offsets. This leads haproxy to crash when receiving acknowledgements for such frames with this thread #1 backtrace: Thread 1 (Thread 0x7211b6ffd640 (LWP 986141)): #0 ha_crash_now () at include/haproxy/bug.h:52 No locals. #1 b_del (b=, del=) at include/haproxy/buf.h:436 No locals. #2 qc_stream_desc_ack (stream=stream@entry=0x7211b6fd9bc8, offset=offset@entry=53176, len=len@entry=1122) at src/quic_stream.c:111 Thank you to @Tristan971 for having provided such traces which reveal this issue: [04|quic|5|c_conn.c:1865] qc_requeue_nacked_pkt_tx_frms(): entering : qc@0x72119c22cfe0 [04|quic|5|_frame.c:1179] qc_frm_unref(): entering : qc@0x72119c22cfe0 [04|quic|5|_frame.c:1186] qc_frm_unref(): remove frame reference : qc@0x72119c22cfe0 frm@0x72118863d260 STREAM_F uni=0 fin=1 id=460 off=52957 len=1122 3244 [04|quic|5|_frame.c:1194] qc_frm_unref(): leaving : qc@0x72119c22cfe0 [04|quic|5|c_conn.c:1902] qc_requeue_nacked_pkt_tx_frms(): updated partially acked frame : qc@0x72119c22cfe0 frm@0x72119c472290 STREAM_F uni=0 fin=1 id=460 off=53176 len=1122 Note that haproxy has much more chance to crash if this frame is the last one (fin bit set). But another condition must be fullfilled to update the ack offset. A previous STREAM frame from the same stream with the same offset but with less data must be acknowledged by the peer. This is the condition to update the ack offset. For others frames without fin bit in the same conditions, I guess the stream may be truncated because too much data are removed from the stream when they are acknowledged. Must be backported to 2.6 and 2.7. 
(cherry picked from commit fc546ab6a7346605c62760569c9afd0c99c6dd76) Signed-off-by: Christopher Faulet (cherry picked from commit 3bcadac543ee69fa03cf7bf0be572522b59e8bb3) Signed-off-by: Christopher Faulet --- src/quic_conn.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/quic_conn.c b/src/quic_conn.c index 256cd0168..ca4d2edcf 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -1855,6 +1855,7 @@ static inline void qc_requeue_nacked_pkt_tx_frms(struct quic_conn *qc, } else if (strm_frm->offset.key < stream_desc->ack_offset) { strm_frm->offset.key = stream_desc->ack_offset; + strm_frm->len -= stream_desc->ack_offset - strm_frm->offset.key; TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); } @@ -2501,6 +2502,7 @@ static void qc_dup_pkt_frms(struct quic_conn *qc, } else if (strm_frm->offset.key < stream_desc->ack_offset) { strm_frm->offset.key = stream_desc->ack_offset; + strm_frm->len -= stream_desc->ack_offset - strm_frm->offset.key; TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); } @@ -6808,6 +6810,7 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, } else if (strm->offset.key < stream_desc->ack_offset) { strm->offset.key = stream_desc->ack_offset; + strm->len -= stream_desc->ack_offset - strm->offset.key; TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, cf); } From a53fdaf7203e45f67c44d7e250cec36875ea8e01 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Thu, 16 Mar 2023 11:43:05 +0100 Subject: [PATCH 122/140] BUG/MEDIUM: connection: Preserve flags when a conn is removed from an idle list The commit 5e1b0e7bf ("BUG/MEDIUM: connection: Clear flags when a conn is removed from an idle list") introduced a regression. CO_FL_SAFE_LIST and CO_FL_IDLE_LIST flags are used when the connection is released to properly decrement used/idle connection counters. if a connection is idle, these flags must be preserved till the connection is really released. 
It may be removed from the list but not immediately released. If these flags are lost when it is finally released, the current number of used connections is erroneously decremented. If means this counter may become negative and the counters tracking the number of idle connecitons is not decremented, suggesting a leak. So, the above commit is reverted and instead we improve a bit the way to detect an idle connection. The function conn_get_idle_flag() must now be used to know if a connection is in an idle list. It returns the connection flag corresponding to the idle list if the connection is idle (CO_FL_SAFE_LIST or CO_FL_IDLE_LIST) or 0 otherwise. But if the connection is scheduled to be removed, 0 is also returned, regardless the connection flags. This new function is used when the connection is temporarily removed from the list to be used, mainly in muxes. This patch should fix #2078 and #2057. It must be backported as far as 2.2. (cherry picked from commit 3a7b539b124bccaa57478e0a5a6d66338594615a) Signed-off-by: Christopher Faulet (cherry picked from commit a81a1e2aea0793aa624565a14cb7579b907f116a) Signed-off-by: Christopher Faulet --- include/haproxy/connection.h | 10 ++++++++++ src/connection.c | 2 +- src/mux_fcgi.c | 6 ++---- src/mux_h1.c | 6 ++---- src/mux_h2.c | 10 ++-------- src/server.c | 1 - src/ssl_sock.c | 2 +- 7 files changed, 18 insertions(+), 19 deletions(-) diff --git a/include/haproxy/connection.h b/include/haproxy/connection.h index 4d289e7b3..8cf22ef4f 100644 --- a/include/haproxy/connection.h +++ b/include/haproxy/connection.h @@ -316,6 +316,16 @@ static inline void conn_set_private(struct connection *conn) } } +/* Used to know if a connection is in an idle list. It returns connection flag + * corresponding to the idle list if the connection is idle (CO_FL_SAFE_LIST or + * CO_FL_IDLE_LIST) or 0 otherwise. Note that if the connection is scheduled to + * be removed, 0 is returned, regardless the connection flags. 
+ */ +static inline unsigned int conn_get_idle_flag(const struct connection *conn) +{ + return (!MT_LIST_INLIST(&conn->toremove_list) ? conn->flags & CO_FL_LIST_MASK : 0); +} + static inline void conn_force_unsubscribe(struct connection *conn) { if (!conn->subs) diff --git a/src/connection.c b/src/connection.c index 4a73dbcc8..5a459fd98 100644 --- a/src/connection.c +++ b/src/connection.c @@ -146,7 +146,7 @@ int conn_notify_mux(struct connection *conn, int old_flags, int forced_wake) ((conn->flags ^ old_flags) & CO_FL_NOTIFY_DONE) || ((old_flags & CO_FL_WAIT_XPRT) && !(conn->flags & CO_FL_WAIT_XPRT))) && conn->mux && conn->mux->wake) { - uint conn_in_list = conn->flags & CO_FL_LIST_MASK; + uint conn_in_list = conn_get_idle_flag(conn); struct server *srv = objt_server(conn->target); if (conn_in_list) { diff --git a/src/mux_fcgi.c b/src/mux_fcgi.c index 4981f6bab..2c417dd1f 100644 --- a/src/mux_fcgi.c +++ b/src/mux_fcgi.c @@ -3043,7 +3043,7 @@ struct task *fcgi_io_cb(struct task *t, void *ctx, unsigned int state) conn = fconn->conn; TRACE_POINT(FCGI_EV_FCONN_WAKE, conn); - conn_in_list = conn->flags & CO_FL_LIST_MASK; + conn_in_list = conn_get_idle_flag(conn); if (conn_in_list) conn_delete_from_tree(&conn->hash_node->node); @@ -3227,10 +3227,8 @@ struct task *fcgi_timeout_task(struct task *t, void *context, unsigned int state /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. 
*/ - if (fconn->conn->flags & CO_FL_LIST_MASK) { + if (fconn->conn->flags & CO_FL_LIST_MASK) conn_delete_from_tree(&fconn->conn->hash_node->node); - fconn->conn->flags &= ~CO_FL_LIST_MASK; - } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/mux_h1.c b/src/mux_h1.c index 56b08a77e..6f59b3112 100644 --- a/src/mux_h1.c +++ b/src/mux_h1.c @@ -3158,7 +3158,7 @@ struct task *h1_io_cb(struct task *t, void *ctx, unsigned int state) /* Remove the connection from the list, to be sure nobody attempts * to use it while we handle the I/O events */ - conn_in_list = conn->flags & CO_FL_LIST_MASK; + conn_in_list = conn_get_idle_flag(conn); if (conn_in_list) conn_delete_from_tree(&conn->hash_node->node); @@ -3282,10 +3282,8 @@ struct task *h1_timeout_task(struct task *t, void *context, unsigned int state) /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. */ - if (h1c->conn->flags & CO_FL_LIST_MASK) { + if (h1c->conn->flags & CO_FL_LIST_MASK) conn_delete_from_tree(&h1c->conn->hash_node->node); - h1c->conn->flags &= ~CO_FL_LIST_MASK; - } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/mux_h2.c b/src/mux_h2.c index f4cb5b188..b62a8f60e 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -4027,11 +4027,10 @@ struct task *h2_io_cb(struct task *t, void *ctx, unsigned int state) conn = h2c->conn; TRACE_ENTER(H2_EV_H2C_WAKE, conn); - conn_in_list = conn->flags & CO_FL_LIST_MASK; - /* Remove the connection from the list, to be sure nobody attempts * to use it while we handle the I/O events */ + conn_in_list = conn_get_idle_flag(conn); if (conn_in_list) conn_delete_from_tree(&conn->hash_node->node); @@ -4163,7 +4162,6 @@ static int h2_process(struct h2c *h2c) if (conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&conn->hash_node->node); - conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, 
&idle_conns[tid].idle_conns_lock); } } @@ -4172,7 +4170,6 @@ static int h2_process(struct h2c *h2c) if (conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&conn->hash_node->node); - conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } } @@ -4253,10 +4250,8 @@ struct task *h2_timeout_task(struct task *t, void *context, unsigned int state) /* We're about to destroy the connection, so make sure nobody attempts * to steal it from us. */ - if (h2c->conn->flags & CO_FL_LIST_MASK) { + if (h2c->conn->flags & CO_FL_LIST_MASK) conn_delete_from_tree(&h2c->conn->hash_node->node); - h2c->conn->flags &= ~CO_FL_LIST_MASK; - } HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } @@ -4309,7 +4304,6 @@ do_leave: if (h2c->conn->flags & CO_FL_LIST_MASK) { HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); conn_delete_from_tree(&h2c->conn->hash_node->node); - h2c->conn->flags &= ~CO_FL_LIST_MASK; HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); } diff --git a/src/server.c b/src/server.c index 8a282bcf9..d701eaeab 100644 --- a/src/server.c +++ b/src/server.c @@ -5717,7 +5717,6 @@ static int srv_migrate_conns_to_remove(struct eb_root *idle_tree, struct mt_list hash_node = ebmb_entry(node, struct conn_hash_node, node); eb_delete(node); - hash_node->conn->flags &= ~CO_FL_LIST_MASK; MT_LIST_APPEND(toremove_list, &hash_node->conn->toremove_list); i++; diff --git a/src/ssl_sock.c b/src/ssl_sock.c index 919a08a88..b2f937487 100644 --- a/src/ssl_sock.c +++ b/src/ssl_sock.c @@ -6481,7 +6481,7 @@ struct task *ssl_sock_io_cb(struct task *t, void *context, unsigned int state) return NULL; } conn = ctx->conn; - conn_in_list = conn->flags & CO_FL_LIST_MASK; + conn_in_list = conn_get_idle_flag(conn); if (conn_in_list) conn_delete_from_tree(&conn->hash_node->node); HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); From 
acbb81d9cb0cc70638c2bbcadc9c740fc3e7b17b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Mar 2023 18:06:19 +0100 Subject: [PATCH 123/140] BUG/MINOR: mux-h2: make sure the h2c task exists before refreshing it When detaching a stream, if it's the last one and the mbuf is blocked, we leave without freeing the stream yet. We also refresh the h2c task's timeout, except that it's possible that there's no such task in case there is no client timeout, causing a crash. The fix just consists in doing this when the task exists. This bug has always been there and is extremely hard to meet even without a client timeout. This fix has to be backported to all branches, but it's unlikely anyone has ever met it anyay. (cherry picked from commit 3fb2c6d5b40bf64d7e4f3fef88739b757555fbff) Signed-off-by: Christopher Faulet (cherry picked from commit 5573b86da519086cfa78a0df821f675e25a34b7c) Signed-off-by: Christopher Faulet --- src/mux_h2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mux_h2.c b/src/mux_h2.c index b62a8f60e..9b3bc42a7 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -4445,7 +4445,7 @@ static void h2_detach(struct sedesc *sd) /* refresh the timeout if none was active, so that the last * leaving stream may arm it. */ - if (!tick_isset(h2c->task->expire)) + if (h2c->task && !tick_isset(h2c->task->expire)) h2c_update_timeout(h2c); return; } From b77482610d7b5983c3e7f5db798cf24a3d69e607 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Mar 2023 17:30:04 +0100 Subject: [PATCH 124/140] MINOR: buffer: add br_single() to check if a buffer ring has more than one buf It's cheaper and cleaner than using br_count()==1 given that it just compares two indexes, and that a ring having a single buffer is in a special case where it is between empty and used up-to-1. In other words it's not congested. 
(cherry picked from commit 9824f8c8908d67f6cedf2434e12a23f18a27eaf0) Signed-off-by: Christopher Faulet (cherry picked from commit 6fd2e323a41d972fd0f6630ee02085366fd591bb) Signed-off-by: Christopher Faulet --- include/haproxy/buf.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/haproxy/buf.h b/include/haproxy/buf.h index bdefc23f4..f260cc155 100644 --- a/include/haproxy/buf.h +++ b/include/haproxy/buf.h @@ -1000,6 +1000,15 @@ static inline unsigned int br_full(const struct buffer *r) return r->data + 1 == r->head || r->data + 1 == r->head - 1 + r->size; } + +/* Returns true if a single buffer is assigned */ +static inline unsigned int br_single(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->data == r->head; +} + /* Returns the index of the ring's head buffer */ static inline unsigned int br_head_idx(const struct buffer *r) { From be35f2a51123ccbcaf0320cbde0d694505450db0 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Mar 2023 16:47:44 +0100 Subject: [PATCH 125/140] BUG/MEDIUM: mux-h2: only restart sending when mux buffer is decongested During performance tests, Emeric faced a case where the wakeups of sc_conn_io_cb() caused by h2_resume_each_sending_h2s() was multiplied by 5-50 and a lot of CPU was being spent doing this for apparently no reason. The culprit is h2_send() not behaving well with congested buffers and small SSL records. What happens when the output is congested is that all buffers are full, and data are emitted in 2kB chunks, which are sufficient to wake all streams up again to ask them to send data again, something that will obviously only work for one of them at best, and waste a lot of CPU in wakeups and memcpy() due to the small buffers. When this happens, the performance can be divided by 2-2.5 on large objects. 
Here the chosen solution against this is to keep in mind that as long as there are still at least two buffers in the ring after calling xprt->snd_buf(), it means that the output is congested and there's no point trying again, because these data will just be placed into such buffers and will wait there. Instead we only mark the buffer decongested once we're back to a single allocated buffer in the ring. By doing so we preserve the ability to deal with large concurrent bursts while not causing a thundering herd by waking all streams for almost nothing. This needs to be backported to 2.7 and 2.6. Other versions could benefit from it as well but it's not strictly necessary, and we can reconsider this option if some excess calls to sc_conn_io_cb() are faced. Note that this fix depends on this recent commit: MINOR: buffer: add br_single() to check if a buffer ring has more than one buf (cherry picked from commit 93c5511af8962d17c94e83d23e6568c4759c8eb6) Signed-off-by: Christopher Faulet (cherry picked from commit a3e25ada050926881f0c2a8f82d2dfded49b086c) Signed-off-by: Christopher Faulet --- src/mux_h2.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/mux_h2.c b/src/mux_h2.c index 9b3bc42a7..b6594eaa3 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -3970,8 +3970,17 @@ static int h2_send(struct h2c *h2c) if (released) offer_buffers(NULL, released); - /* wrote at least one byte, the buffer is not full anymore */ - if (sent) + /* Normally if wrote at least one byte, the buffer is not full + * anymore. However, if it was marked full because all of its + * buffers were used, we don't want to instantly wake up many + * streams because we'd create a thundering herd effect, notably + * when data are flushed in small chunks. Instead we wait for + * the buffer to be decongested again before allowing to send + * again. 
It also has the added benefit of not pumping more + * data from the other side when it's known that this one is + * still congested. + */ + if (sent && br_single(h2c->mbuf)) h2c->flags &= ~(H2_CF_MUX_MFULL | H2_CF_DEM_MROOM); } From 14e256dd6bc295e0e55c76891920118bf1f1cf75 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 16 Mar 2023 17:30:30 +0100 Subject: [PATCH 126/140] BUG/MINOR: mux-h2: set CO_SFL_STREAMER when sending lots of data Emeric noticed that h2 bit-rate performance was always slightly lower than h1 when the CPU is saturated. Strace showed that we were always data in 2kB chunks, corresponding to the max_record size. What's happening is that when this mechanism of dynamic record size was introduced, the STREAMER flag at the stream level was relied upon. Since all this was moved to the muxes, the flag has to be passed as an argument to the snd_buf() function, but the mux h2 did not use it despite a comment mentioning it, probably because before the multi-buf it was not easy to figure the status of the buffer. The solution here consists in checking if the mbuf is congested or not, by checking if it has more than one buffer allocated. If so we set the CO_SFL_STREAMER flag, otherwise we don't. This way moderate size exchanges continue to be made over small chunks, but downloads will be able to use the large ones. While it could be backported to all supported versions, it would be better to limit it to the last LTS, so let's do it for 2.7 and 2.6 only. This patch requires previous commit "MINOR: buffer: add br_single() to check if a buffer ring has more than one buf". 
(cherry picked from commit 14ea98af7392225c26afa9f97a33b913e091b02b) Signed-off-by: Christopher Faulet (cherry picked from commit 558ac459de1294ced755ace5efffdb7036baac58) Signed-off-by: Christopher Faulet --- src/mux_h2.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/mux_h2.c b/src/mux_h2.c index b6594eaa3..4ea3d498f 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -3948,6 +3948,16 @@ static int h2_send(struct h2c *h2c) if (h2c->flags & (H2_CF_MUX_MFULL | H2_CF_DEM_MBUSY | H2_CF_DEM_MROOM)) flags |= CO_SFL_MSG_MORE; + if (!br_single(h2c->mbuf)) { + /* usually we want to emit small TLS records to speed + * up the decoding on the client. That's what is being + * done by default. However if there is more than one + * buffer being allocated, we're streaming large data + * so we stich to large records. + */ + flags |= CO_SFL_STREAMER; + } + for (buf = br_head(h2c->mbuf); b_size(buf); buf = br_del_head(h2c->mbuf)) { if (b_data(buf)) { int ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, buf, b_data(buf), flags); From a68b664ab7292c4bf446aa50a2f4016d78b4738d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20L=C3=A9caille?= Date: Fri, 17 Mar 2023 08:56:50 +0100 Subject: [PATCH 127/140] BUG/MINOR: quic: Missing STREAM frame data pointer updates This patch follows this one which was not sufficient: "BUG/MINOR: quic: Missing STREAM frame length updates" Indeed, it is not sufficient to update the ->len and ->offset member of a STREAM frame to move it forward. The data pointer must also be updated. This is not done by the STREAM frame builder. Must be backported to 2.6 and 2.7. 
(cherry picked from commit ca07979b978ed683b96d0ec99073c7cb972fd022) Signed-off-by: Willy Tarreau (cherry picked from commit dcc827589a3a351af315d1f4cf0126dd2dc8a746) [cf: ctx adjustment] Signed-off-by: Christopher Faulet --- include/haproxy/quic_frame.h | 13 +++++++++++++ src/quic_conn.c | 15 +++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/include/haproxy/quic_frame.h b/include/haproxy/quic_frame.h index c89323076..2a96aae58 100644 --- a/include/haproxy/quic_frame.h +++ b/include/haproxy/quic_frame.h @@ -176,5 +176,18 @@ static inline struct quic_err quic_err_app(uint64_t code) return (struct quic_err){ .code = code, .app = 1 }; } +/* Move forward STREAM frame by bytes. */ +static inline void qc_stream_frm_mv_fwd(struct quic_stream *strm, uint64_t data) +{ + struct buffer cf_buf; + + strm->offset.key += data; + strm->len -= data; + cf_buf = b_make(b_orig(strm->buf), + b_size(strm->buf), + (char *)strm->data - b_orig(strm->buf), 0); + strm->data = (unsigned char *)b_peek(&cf_buf, data); +} + #endif /* USE_QUIC */ #endif /* _HAPROXY_QUIC_FRAME_H */ diff --git a/src/quic_conn.c b/src/quic_conn.c index ca4d2edcf..789049b2a 100644 --- a/src/quic_conn.c +++ b/src/quic_conn.c @@ -1854,8 +1854,9 @@ static inline void qc_requeue_nacked_pkt_tx_frms(struct quic_conn *qc, continue; } else if (strm_frm->offset.key < stream_desc->ack_offset) { - strm_frm->offset.key = stream_desc->ack_offset; - strm_frm->len -= stream_desc->ack_offset - strm_frm->offset.key; + uint64_t diff = stream_desc->ack_offset - strm_frm->offset.key; + + qc_stream_frm_mv_fwd(strm_frm, diff); TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); } @@ -2501,8 +2502,9 @@ static void qc_dup_pkt_frms(struct quic_conn *qc, continue; } else if (strm_frm->offset.key < stream_desc->ack_offset) { - strm_frm->offset.key = stream_desc->ack_offset; - strm_frm->len -= stream_desc->ack_offset - strm_frm->offset.key; + uint64_t diff = stream_desc->ack_offset - 
strm_frm->offset.key; + + qc_stream_frm_mv_fwd(strm_frm, diff); TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, frm); } @@ -6809,8 +6811,9 @@ static inline int qc_build_frms(struct list *outlist, struct list *inlist, continue; } else if (strm->offset.key < stream_desc->ack_offset) { - strm->offset.key = stream_desc->ack_offset; - strm->len -= stream_desc->ack_offset - strm->offset.key; + uint64_t diff = stream_desc->ack_offset - strm->offset.key; + + qc_stream_frm_mv_fwd(strm, diff); TRACE_DEVEL("updated partially acked frame", QUIC_EV_CONN_PRSAFRM, qc, cf); } From f1a031aca71dced4e66c36bf5878d2e976cbe9b0 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Wed, 11 Jan 2023 10:59:52 +0100 Subject: [PATCH 128/140] BUG/MEDIUM: listener: duplicate inherited FDs if needed Since commit 36d9097cf ("MINOR: fd: Add BUG_ON checks on fd_insert()"), there is currently a test in fd_insert() to detect that we're not trying to reinsert an FD that had already been inserted. This test catches the following anomalies: frontend fail1 bind fd@0 bind fd@0 and: frontend fail2 bind fd@0 shards 2 What happens is that clone_listener() is called on a listener already having an FD, and when sock_{inet,unix}_bind_receiver() are called, the same FD will be registered multiple times and rightfully crash in the sanity check. It wouldn't be correct to block shards though (e.g. they could be used in a default-bind line). What looks like a safer and more future-proof approach simply is to dup() the FD so that each listener has one copy. This is also the only solution that might allow later to support more than 64 threads on an inherited FD. This needs to be backported as far as 2.4. Better wait for at least one extra -dev version before backporting though, as the bug should not be triggered often anyway. 
(cherry picked from commit 145b17fd2f6857e45ca184419f29f41343446fc5) Signed-off-by: Willy Tarreau (cherry picked from commit 2aa96f003af84fc68b89efdfb4d6f1866f13788a) Signed-off-by: Christopher Faulet --- src/sock_inet.c | 18 ++++++++++++++++++ src/sock_unix.c | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/src/sock_inet.c b/src/sock_inet.c index 46cc16a1c..2bd27df96 100644 --- a/src/sock_inet.c +++ b/src/sock_inet.c @@ -312,6 +312,24 @@ int sock_inet_bind_receiver(struct receiver *rx, char **errmsg) } } + if (ext && fd < global.maxsock && fdtab[fd].owner) { + /* This FD was already bound so this means that it was already + * known and registered before parsing, hence it's an inherited + * FD. The only reason why it's already known here is that it + * has been registered multiple times (multiple listeners on the + * same, or a "shards" directive on the line). There cannot be + * multiple listeners on one FD but at least we can create a + * new one from the original one. We won't reconfigure it, + * however, as this was already done for the first one. + */ + fd = dup(fd); + if (fd == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno)); + goto bind_return; + } + } + if (fd >= global.maxsock) { err |= ERR_FATAL | ERR_ABORT | ERR_ALERT; memprintf(errmsg, "not enough free sockets (raise '-n' parameter)"); diff --git a/src/sock_unix.c b/src/sock_unix.c index 143295e79..47f8566ec 100644 --- a/src/sock_unix.c +++ b/src/sock_unix.c @@ -226,6 +226,24 @@ int sock_unix_bind_receiver(struct receiver *rx, char **errmsg) } fd_ready: + if (ext && fd < global.maxsock && fdtab[fd].owner) { + /* This FD was already bound so this means that it was already + * known and registered before parsing, hence it's an inherited + * FD. The only reason why it's already known here is that it + * has been registered multiple times (multiple listeners on the + * same, or a "shards" directive on the line). 
There cannot be + * multiple listeners on one FD but at least we can create a + * new one from the original one. We won't reconfigure it, + * however, as this was already done for the first one. + */ + fd = dup(fd); + if (fd == -1) { + err |= ERR_RETRYABLE | ERR_ALERT; + memprintf(errmsg, "cannot dup() receiving socket (%s)", strerror(errno)); + goto bind_return; + } + } + if (fd >= global.maxsock) { err |= ERR_FATAL | ERR_ABORT | ERR_ALERT; memprintf(errmsg, "not enough free sockets (raise '-n' parameter)"); From 70d53f0e77ff6d6c6fc079ce3e40b35b671e317a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Thu, 26 Jan 2023 09:38:53 +0100 Subject: [PATCH 129/140] MINOR: h2: add h2_phdr_to_ist() to make ISTs from pseudo headers Till now pseudo headers were passed as const strings, but having them as ISTs will be more convenient for traces. This doesn't change anything for strings which are derived from them (and being constants they're still zero-terminated). (cherry picked from commit 271c440392242b5129ffc68787b781b1c39b4618) [wt: will be used for h2 header traces] Signed-off-by: Willy Tarreau (cherry picked from commit 666d816bbb9ea25dc006a2a8de4335db242f8022) Signed-off-by: Christopher Faulet --- include/haproxy/h2.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/include/haproxy/h2.h b/include/haproxy/h2.h index 8d2aa9511..84e4c76fc 100644 --- a/include/haproxy/h2.h +++ b/include/haproxy/h2.h @@ -318,19 +318,27 @@ static inline int h2_str_to_phdr(const struct ist str) return 0; } +/* returns the pseudo-header name as an ist, or ":UNKNOWN" if unknown. + * Note that all strings are zero-terminated constants. 
+ */ +static inline struct ist h2_phdr_to_ist(int phdr) +{ + switch (phdr) { + case H2_PHDR_IDX_NONE: return ist(":NONE"); + case H2_PHDR_IDX_AUTH: return ist(":authority"); + case H2_PHDR_IDX_METH: return ist(":method"); + case H2_PHDR_IDX_PATH: return ist(":path"); + case H2_PHDR_IDX_SCHM: return ist(":scheme"); + case H2_PHDR_IDX_STAT: return ist(":status"); + case H2_PHDR_IDX_HOST: return ist("Host"); + default: return ist(":UNKNOWN"); + } +} + /* returns the pseudo-header name as a string, or ":UNKNOWN" if unknown */ static inline const char *h2_phdr_to_str(int phdr) { - switch (phdr) { - case H2_PHDR_IDX_NONE: return ":NONE"; - case H2_PHDR_IDX_AUTH: return ":authority"; - case H2_PHDR_IDX_METH: return ":method"; - case H2_PHDR_IDX_PATH: return ":path"; - case H2_PHDR_IDX_SCHM: return ":scheme"; - case H2_PHDR_IDX_STAT: return ":status"; - case H2_PHDR_IDX_HOST: return "Host"; - default: return ":UNKNOWN"; - } + return h2_phdr_to_ist(phdr).ptr; } #endif /* _HAPROXY_H2_H */ From 17089e72d2ea4b855ab5909e97d8601594b19362 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 19:43:11 +0100 Subject: [PATCH 130/140] MEDIUM: mux-h2/trace: add tracing support for headers Now we can make use of TRACE_PRINTF() to iterate over headers as they are received or dumped. It's worth noting that the dumps may occasionally be interrupted due to a buffer full or a realign, but in this case it will be visible because the trace will restart from the first one. All these headers (and trailers) may be interleaved with other connections' so they're all preceeded by the pointer to the connection and optionally the stream (or alternately the stream ID) to help discriminating them. Since it's not easy to read the header directions, sent headers are prefixed with "sndh" and received headers are prefixed with "rcvh", both of which are rare enough in the traces to conveniently support a quick grep. 
In order to avoid code duplication, h2_encode_headers() was implemented as a wrapper on top of hpack_encode_header(), which optionally emits the header to the trace if the trace is active. In addition, for headers that are encoded using a different method, h2_trace_header() was added as well. Header names are truncated to 256 bytes and values to 1024 bytes. If the lengths are larger, they will be truncated and suffixed with "(... +xxx)" where "xxx" is the number of extra bytes. Example of what an end-to-end H2 request gives: [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh :method: GET [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh :path: / [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh :scheme: http [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh :authority: localhost:14446 [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh user-agent: curl/7.54.1 [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh accept: */* [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c13120(F,FRP) dsi=1 rcvh cookie: blah [00|h2|5|mux_h2.c:5491] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh :method: GET [00|h2|5|mux_h2.c:5572] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh :authority: localhost:14446 [00|h2|5|mux_h2.c:5596] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh :path: / [00|h2|5|mux_h2.c:5647] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh user-agent: curl/7.54.1 [00|h2|5|mux_h2.c:5647] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh accept: */* [00|h2|5|mux_h2.c:5647] h2s_bck_make_req_headers(): h2c=0x1c1cd90(B,FRH) h2s=0x1c1e3d0(1,IDL) sndh cookie: blah [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c1cd90(B,FRP) dsi=1 rcvh :status: 200 [00|h2|5|mux_h2.c:4818] 
h2c_decode_headers(): h2c=0x1c1cd90(B,FRP) dsi=1 rcvh content-length: 0 [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c1cd90(B,FRP) dsi=1 rcvh x-req: size=102, time=0 ms [00|h2|5|mux_h2.c:4818] h2c_decode_headers(): h2c=0x1c1cd90(B,FRP) dsi=1 rcvh x-rsp: id=dummy, code=200, cache=1, size=0, time=0 ms (0 real) [00|h2|5|mux_h2.c:5210] h2s_frt_make_resp_headers(): h2c=0x1c13120(F,FRH) h2s=0x1c1c780(1,HCR) sndh :status: 200 [00|h2|5|mux_h2.c:5231] h2s_frt_make_resp_headers(): h2c=0x1c13120(F,FRH) h2s=0x1c1c780(1,HCR) sndh content-length: 0 [00|h2|5|mux_h2.c:5231] h2s_frt_make_resp_headers(): h2c=0x1c13120(F,FRH) h2s=0x1c1c780(1,HCR) sndh x-req: size=102, time=0 ms [00|h2|5|mux_h2.c:5231] h2s_frt_make_resp_headers(): h2c=0x1c13120(F,FRH) h2s=0x1c1c780(1,HCR) sndh x-rsp: id=dummy, code=200, cache=1, size=0, time=0 ms (0 real) At some point the frontend/backend names would be useful but that's a more general comment than just the H2 traces. (cherry picked from commit 11e8a8c2ac553c3dcbedddef4e3b444a02f21669) [wt: backported since it significantly helps for debugging; requires the following fix] Signed-off-by: Willy Tarreau (cherry picked from commit 53413179f06ea01965d8e8862560acc88549c638) Signed-off-by: Christopher Faulet --- src/mux_h2.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 119 insertions(+), 8 deletions(-) diff --git a/src/mux_h2.c b/src/mux_h2.c index 4ea3d498f..c5d6dca8b 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -994,6 +994,80 @@ static int h2_avail_streams(struct connection *conn) return ret1; } +/* inconditionally produce a trace of the header. Please do not call this one + * and use h2_trace_header() instead which first checks if traces are enabled. 
+ */ +void _h2_trace_header(const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + struct ist n_ist, v_ist; + const char *c_str, *s_str; + + chunk_reset(&trash); + c_str = chunk_newstr(&trash); + if (h2c) { + chunk_appendf(&trash, "h2c=%p(%c,%s) ", + h2c, (h2c->flags & H2_CF_IS_BACK) ? 'B' : 'F', h2c_st_to_str(h2c->st0)); + } + + s_str = chunk_newstr(&trash); + if (h2s) { + if (h2s->id <= 0) + chunk_appendf(&trash, "dsi=%d ", h2s->h2c->dsi); + chunk_appendf(&trash, "h2s=%p(%d,%s) ", h2s, h2s->id, h2s_st_to_str(h2s->st)); + } + else if (h2c) + chunk_appendf(&trash, "dsi=%d ", h2c->dsi); + + n_ist = ist2(chunk_newstr(&trash), 0); + istscpy(&n_ist, hn, 256); + trash.data += n_ist.len; + if (n_ist.len != hn.len) + chunk_appendf(&trash, " (... +%ld)", (long)(hn.len - n_ist.len)); + + v_ist = ist2(chunk_newstr(&trash), 0); + istscpy(&v_ist, hv, 1024); + trash.data += v_ist.len; + if (v_ist.len != hv.len) + chunk_appendf(&trash, " (... +%ld)", (long)(hv.len - v_ist.len)); + + TRACE_PRINTF_LOC(TRACE_LEVEL_USER, mask, trc_loc, func, + h2c->conn, 0, 0, 0, + "%s%s%s %s: %s", c_str, s_str, + (mask & H2_EV_TX_HDR) ? "sndh" : "rcvh", + n_ist.ptr, v_ist.ptr); +} + +/* produce a trace of the header after checking that tracing is enabled */ +static inline void h2_trace_header(const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + if ((TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED && + TRACE_ENABLED(TRACE_LEVEL_USER, mask, h2c ? h2c->conn : 0, h2s, 0, 0)) + _h2_trace_header(hn, hv, mask, trc_loc, func, h2c, h2s); +} + +/* hpack-encode header name and value , possibly emitting a trace if + * currently enabled. This is done on behalf of function at + * passed as ist(TRC_LOC), h2c , and h2s , all of which may be NULL. 
+ * The trace is only emitted if the header is emitted (in which case non-zero + * is returned). The trash is modified. In the traces, the header's name will + * be truncated to 256 chars and the header's value to 1024 chars. + */ +static inline int h2_encode_header(struct buffer *buf, const struct ist hn, const struct ist hv, + uint64_t mask, const struct ist trc_loc, const char *func, + const struct h2c *h2c, const struct h2s *h2s) +{ + int ret; + + ret = hpack_encode_header(buf, hn, hv); + if (ret) + h2_trace_header(hn, hv, mask, trc_loc, func, h2c, h2s); + + return ret; +} /*****************************************************************/ /* functions below are dedicated to the mux setup and management */ @@ -4954,6 +5028,26 @@ next_frame: /* past this point we cannot roll back in case of error */ outlen = hpack_decode_frame(h2c->ddht, hdrs, flen, list, sizeof(list)/sizeof(list[0]), tmp); + + if (outlen > 0 && + (TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED && + TRACE_ENABLED(TRACE_LEVEL_USER, H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, 0, 0, 0)) { + struct ist n; + int i; + + for (i = 0; list[i].n.len; i++) { + n = list[i].n; + + if (!isttest(n)) { + /* this is in fact a pseudo header whose number is in n.len */ + n = h2_phdr_to_ist(n.len); + } + + h2_trace_header(n, list[i].v, H2_EV_RX_FRAME|H2_EV_RX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, NULL); + } + } + if (outlen < 0) { TRACE_STATE("failed to decompress HPACK", H2_EV_RX_FRAME|H2_EV_RX_HDR|H2_EV_H2C_ERR|H2_EV_PROTO_ERR, h2c->conn); h2c_error(h2c, H2_ERR_COMPRESSION_ERROR); @@ -5345,6 +5439,14 @@ static size_t h2s_frt_make_resp_headers(struct h2s *h2s, struct htx *htx) goto full; } + if ((TRACE_SOURCE)->verbosity >= H2_VERB_ADVANCED) { + char sts[4]; + + h2_trace_header(ist(":status"), ist(ultoa_r(h2s->status, sts, sizeof(sts))), + H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, + h2c, h2s); + } + /* encode all headers, stop at empty name */ for (hdr = 0; hdr < sizeof(list)/sizeof(list[0]); hdr++) { 
/* these ones do not exist in H2 and must be dropped. */ @@ -5362,7 +5464,8 @@ static size_t h2s_frt_make_resp_headers(struct h2s *h2s, struct htx *htx) if (isteq(list[hdr].n, ist(""))) break; // end - if (!hpack_encode_header(&outbuf, list[hdr].n, list[hdr].v)) { + if (!h2_encode_header(&outbuf, list[hdr].n, list[hdr].v, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; @@ -5629,6 +5732,8 @@ static size_t h2s_bck_make_req_headers(struct h2s *h2s, struct htx *htx) goto full; } + h2_trace_header(ist(":method"), meth, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s); + auth = ist(NULL); /* RFC7540 #8.3: the CONNECT method must have : @@ -5642,12 +5747,14 @@ static size_t h2s_bck_make_req_headers(struct h2s *h2s, struct htx *htx) if (unlikely(sl->info.req.meth == HTTP_METH_CONNECT) && !extended_connect) { auth = uri; - if (!hpack_encode_header(&outbuf, ist(":authority"), auth)) { + if (!h2_encode_header(&outbuf, ist(":authority"), auth, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; goto full; } + h2s->flags |= H2_SF_BODY_TUNNEL; } else { /* other methods need a :scheme. If an authority is known from @@ -5707,7 +5814,9 @@ static size_t h2s_bck_make_req_headers(struct h2s *h2s, struct htx *htx) goto full; } - if (auth.len && !hpack_encode_header(&outbuf, ist(":authority"), auth)) { + if (auth.len && + !h2_encode_header(&outbuf, ist(":authority"), auth, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; @@ -5731,15 +5840,16 @@ static size_t h2s_bck_make_req_headers(struct h2s *h2s, struct htx *htx) goto full; } + h2_trace_header(ist(":path"), uri, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s); + /* encode the pseudo-header protocol from rfc8441 if using * Extended CONNECT method. 
*/ if (unlikely(extended_connect)) { const struct ist protocol = ist(h2s->upgrade_protocol); if (isttest(protocol)) { - if (!hpack_encode_header(&outbuf, - ist(":protocol"), - protocol)) { + if (!h2_encode_header(&outbuf, ist(":protocol"), protocol, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; @@ -5782,7 +5892,7 @@ static size_t h2s_bck_make_req_headers(struct h2s *h2s, struct htx *htx) if (isteq(n, ist(""))) break; // end - if (!hpack_encode_header(&outbuf, n, v)) { + if (!h2_encode_header(&outbuf, n, v, H2_EV_TX_FRAME|H2_EV_TX_HDR, ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; @@ -6351,7 +6461,8 @@ static size_t h2s_make_trailers(struct h2s *h2s, struct htx *htx) if (*(list[idx].n.ptr) == ':') continue; - if (!hpack_encode_header(&outbuf, list[idx].n, list[idx].v)) { + if (!h2_encode_header(&outbuf, list[idx].n, list[idx].v, H2_EV_TX_FRAME|H2_EV_TX_HDR, + ist(TRC_LOC), __FUNCTION__, h2c, h2s)) { /* output full */ if (b_space_wraps(mbuf)) goto realign_again; From cf68c1918a15aa1afae7fb5fc6472df7dedaa4f9 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Mon, 30 Jan 2023 08:26:09 +0100 Subject: [PATCH 131/140] BUG/MINOR: mux-h2: Fix possible null pointer deref on h2c in _h2_trace_header() As reported by Coverity, this function may be called with no h2c. Thus, the pointer must always be checked before any access. One test was missing in TRACE_PRINTF_LOC(). This patch should fix the issue #2015. No backport needed, except if the commit 11e8a8c2a ("MEDIUM: mux-h2/trace: add tracing support for headers") is backported. 
(cherry picked from commit c254516c5347052041630f9769ca1cd155d4f1d0) Signed-off-by: Willy Tarreau (cherry picked from commit b899aabd7aa8cf1daff519169fab3993a952812f) Signed-off-by: Christopher Faulet --- src/mux_h2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mux_h2.c b/src/mux_h2.c index c5d6dca8b..0f7574dcc 100644 --- a/src/mux_h2.c +++ b/src/mux_h2.c @@ -1033,7 +1033,7 @@ void _h2_trace_header(const struct ist hn, const struct ist hv, chunk_appendf(&trash, " (... +%ld)", (long)(hv.len - v_ist.len)); TRACE_PRINTF_LOC(TRACE_LEVEL_USER, mask, trc_loc, func, - h2c->conn, 0, 0, 0, + (h2c ? h2c->conn : 0), 0, 0, 0, "%s%s%s %s: %s", c_str, s_str, (mask & H2_EV_TX_HDR) ? "sndh" : "rcvh", n_ist.ptr, v_ist.ptr); From 6b87f11bbc07514b671317e2ab7567abddbbacd5 Mon Sep 17 00:00:00 2001 From: Christopher Faulet Date: Mon, 13 Feb 2023 11:37:26 +0100 Subject: [PATCH 132/140] BUG/MEDIUM: spoe: Don't set the default traget for the SPOE agent frontend The commit d5983cef8 ("MINOR: listener: remove the useless ->default_target field") revealed a bug in the SPOE. No default-target must be defined for the SPOE agent frontend. SPOE applets are used on the frontend side and a TCP connection is established on the backend side. Because of this bug, since the commit above, the stream target is set to the SPOE applet instead of the backend connection, leading to a spinning loop on the applet when it is released because are unable to close the backend side. This patch should fix the issue #2040. It only affects the 2.8-DEV but to avoid any future bug, it should be backported to all stable versions. 
(cherry picked from commit 25e36bfc228e8398171551f8c12fec29dcad0eae) Signed-off-by: Willy Tarreau (cherry picked from commit 5c8f140e5aff5f496d1694a3216ea354223cb9ef) Signed-off-by: Christopher Faulet --- src/flt_spoe.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/flt_spoe.c b/src/flt_spoe.c index 853452f53..1c511aecb 100644 --- a/src/flt_spoe.c +++ b/src/flt_spoe.c @@ -3053,7 +3053,6 @@ spoe_init(struct proxy *px, struct flt_conf *fconf) conf->agent_fe.accept = frontend_accept; conf->agent_fe.srv = NULL; conf->agent_fe.timeout.client = TICK_ETERNITY; - conf->agent_fe.default_target = &spoe_applet.obj_type; conf->agent_fe.fe_req_ana = AN_REQ_SWITCHING_RULES; if (!sighandler_registered) { From d59e6240389a390c63fac6b5c2419fbd531ed026 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Mon, 6 Feb 2023 18:50:51 +0100 Subject: [PATCH 133/140] BUG/MINOR: proto_ux: report correct error when bind_listener fails In uxst_bind_listener() and uxdg_bind_listener(), when the function fails because the listener is not bound, both function are setting the error message but don't set the err status before returning. Because of this, such error is not properly handled by the upper functions. Making sure this error is properly catched by returning a composition of ERR_FATAL and ERR_ALERT. This could be backported up to 2.4. 
(cherry picked from commit d861dc9b48789f9c6c7f092d1a3114695cfbbe0d) Signed-off-by: Willy Tarreau (cherry picked from commit ab2d4405205575b46cb13fe34216a6d5896eca0a) Signed-off-by: Christopher Faulet --- src/proto_uxdg.c | 1 + src/proto_uxst.c | 1 + 2 files changed, 2 insertions(+) diff --git a/src/proto_uxdg.c b/src/proto_uxdg.c index 41e01004f..68fe207de 100644 --- a/src/proto_uxdg.c +++ b/src/proto_uxdg.c @@ -95,6 +95,7 @@ int uxdg_bind_listener(struct listener *listener, char *errmsg, int errlen) if (!(listener->rx.flags & RX_F_BOUND)) { msg = "receiving socket not bound"; + err |= ERR_FATAL | ERR_ALERT; goto uxdg_return; } diff --git a/src/proto_uxst.c b/src/proto_uxst.c index c9639e76e..fd22e95bb 100644 --- a/src/proto_uxst.c +++ b/src/proto_uxst.c @@ -119,6 +119,7 @@ static int uxst_bind_listener(struct listener *listener, char *errmsg, int errle if (!(listener->rx.flags & RX_F_BOUND)) { msg = "receiving socket not bound"; + err |= ERR_FATAL | ERR_ALERT; goto uxst_return; } From 44eef1b3b566ec73a6d242ca347e6b6111dfabaa Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Tue, 7 Feb 2023 15:51:58 +0100 Subject: [PATCH 134/140] BUG/MINOR: protocol: fix minor memory leak in protocol_bind_all() In protocol_bind_all() (involved in startup sequence): We only free errmsg (set by fam->bind() attempt) when we make use of it. But this could lead to some memory leaks because there are some cases where we ignore the error message (e.g: verbose=0 with ERR_WARN messages). As long as errmsg is set, we should always free it. As mentioned earlier, this really is a minor leak because it can only occur on specific conditions (error paths) during the startup phase. This may be backported up to 2.4. 
-- Backport notes: -> 2.4 only: Replace this: | ha_warning("Binding [%s:%d] for %s %s: %s\n", | listener->bind_conf->file, listener->bind_conf->line, | proxy_type_str(px), px->id, errmsg); By this: | else if (lerr & ERR_WARN) | ha_warning("Starting %s %s: %s\n", | proxy_type_str(px), px->id, errmsg); (cherry picked from commit 8429627e3c2eb472dc94ec8d3d7275ef68a79128) Signed-off-by: Willy Tarreau (cherry picked from commit da9a15ff0326d59ba38f5a1b258820d91c7df649) Signed-off-by: Christopher Faulet --- src/protocol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/protocol.c b/src/protocol.c index 03f708591..146733a3f 100644 --- a/src/protocol.c +++ b/src/protocol.c @@ -92,8 +92,10 @@ int protocol_bind_all(int verbose) ha_warning("Binding [%s:%d] for %s %s: %s\n", listener->bind_conf->file, listener->bind_conf->line, proxy_type_str(px), px->id, errmsg); - ha_free(&errmsg); } + if (lerr != ERR_NONE) + ha_free(&errmsg); + if (lerr & ERR_ABORT) break; From 985e257b1fa5534ef8cebd7a0aa075dd10327604 Mon Sep 17 00:00:00 2001 From: Aurelien DARRAGON Date: Tue, 21 Feb 2023 17:33:50 +0100 Subject: [PATCH 135/140] BUG/MINOR: sock_unix: match finalname with tempname in sock_unix_addrcmp() In sock_unix_addrcmp(), named UNIX sockets paths are manually compared in order to properly handle tempname paths (ending with ".XXXX.tmp") that result from the 2-step bind implemented in sock_unix_bind_receiver(). However, this logic does not take into account "final" path names (without the ".XXXX.tmp" suffix). Example: /tmp/test did not match with /tmp/test.1288.tmp prior to this patch Indeed, depending on how the socket addr is retrieved, the same socket could be designated either by its tempname or finalname. socket addr is normally stored with its finalname within a receiver, but a call to getsockname() on the same socket will return the tempname that was used for the bind() call (sock_get_old_sockets() depends on getsockname()). 
This causes sock_find_compatible_fd() to malfunction with named UNIX sockets (ie: haproxy -x CLI option). To fix this, we slightly modify the check around the temp suffix in sock_unix_addrcmp(): we perform the suffix check even if one of the paths is lacking the temp suffix (with proper precautions). Now the function is able to match: - finalname x finalname - tempname x tempname - finalname x tempname That is: /tmp/test == /tmp/test.1288.tmp == /tmp/test.X.tmp It should be backported up to 2.4 (cherry picked from commit 2a7903bbb2102867132d9821913c51cb1b938962) Signed-off-by: Willy Tarreau (cherry picked from commit af3fa8c632abd425278bd8a8ae4ca89e41e92064) Signed-off-by: Christopher Faulet --- src/sock_unix.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/sock_unix.c b/src/sock_unix.c index 47f8566ec..1c3dbd761 100644 --- a/src/sock_unix.c +++ b/src/sock_unix.c @@ -93,7 +93,21 @@ int sock_unix_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_st /* Now we have a difference. It's OK if they are within or after a * sequence of digits following a dot, and are followed by ".tmp". + * + * make sure to perform the check against tempname if the compared + * string is in "final" format (does not end with ".XXXX.tmp"). + * + * Examples: + * /tmp/test matches with /tmp/test.1822.tmp + * /tmp/test.1822.tmp matches with /tmp/test.XXXX.tmp */ + if (au->sun_path[idx] == 0 || bu->sun_path[idx] == 0) { + if (au->sun_path[idx] == '.' 
|| bu->sun_path[idx] == '.') + dot = idx; /* try to match against temp path */ + else + return -1; /* invalid temp path */ + } + if (!dot) return -1; From 158c9f2a35a5ddca9a5b0849373f3e3ef5d5e82f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 17:48:53 +0100 Subject: [PATCH 136/140] MINOR: trace: add a TRACE_ENABLED() macro to determine if a trace is active Sometimes it would be necessary to prepare some messages, pre-process some blocks or maybe duplicate some contents before they vanish for the purpose of tracing them. However we don't want to do that for everything that is submitted to the traces, it's important to do it only for what will really be traced. The __trace() function has all the knowledge for this, to the point of even checking the lockon pointers. This commit splits the function in two, one with the trace decision logic, and the other one for the trace production. The first one is now usable through wrappers such as _trace_enabled() and TRACE_ENABLED() which will indicate whether traces are going to be produced for the current source, level, event mask, parameters and tracking. (cherry picked from commit 8f9a9704bb7f76d3266ef6cc1ff126236e96f119) [wt: will be needed for further QUIC patches] Signed-off-by: Willy Tarreau (cherry picked from commit e2ba91e9ea0ae4b0f890210cb55a03e483ef15ba) Signed-off-by: Christopher Faulet --- include/haproxy/trace.h | 19 ++++++++++ src/trace.c | 78 ++++++++++++++++++++++++++++++----------- 2 files changed, 77 insertions(+), 20 deletions(-) diff --git a/include/haproxy/trace.h b/include/haproxy/trace.h index caf82a664..2c823e460 100644 --- a/include/haproxy/trace.h +++ b/include/haproxy/trace.h @@ -39,6 +39,15 @@ */ #define TRC_5ARGS(a0,a1,a2,a3,a4,a5,...) DEFNULL(a1),DEFNULL(a2),DEFNULL(a3),DEFNULL(a4),DEFNULL(a5) +/* reports whether trace is active for the source and the arguments. 
It uses + * the same criteria as trace() (locking, filtering etc) so it's safe to use + * from application code to decide whether or not to engage in heavier data + * preparation processing. + */ +#define _trace_enabled(level, mask, src, args...) \ + (unlikely((src)->state != TRACE_STATE_STOPPED && \ + __trace_enabled(level, mask, src, ##args, NULL) > 0)) + /* sends a trace for the given source. Arguments are passed in the exact same * order as in the __trace() function, which is only called if (src)->state is * not TRACE_STATE_STOPPED. This is the only case where arguments are evaluated. @@ -64,7 +73,12 @@ * before calling the __trace() function. _trace() shouldn't be a function (nor * inline) itself because we don't want the caller to compute its arguments if * traces are not enabled. + * + * TRACE_ENABLED() reports whether or not trace is enabled for the current + * source, level, mask and arguments. */ +#define TRACE_ENABLED(level, mask, args...) (_trace_enabled((level), (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, ##args)) + #define TRACE(msg, mask, args...) 
\ _trace(TRACE_LEVEL, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg)) @@ -122,6 +136,11 @@ extern struct list trace_sources; extern THREAD_LOCAL struct buffer trace_buf; +int __trace_enabled(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + const void **plockptr); + void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, const struct ist where, const char *func, const void *a1, const void *a2, const void *a3, const void *a4, diff --git a/src/trace.c b/src/trace.c index 5909dd438..27824c8e6 100644 --- a/src/trace.c +++ b/src/trace.c @@ -69,14 +69,24 @@ static inline const void *trace_pick_arg(uint32_t arg_def, const void *a1, const return NULL; } -/* write a message for the given trace source */ -void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, - const struct ist where, const char *func, - const void *a1, const void *a2, const void *a3, const void *a4, - void (*cb)(enum trace_level level, uint64_t mask, const struct trace_source *src, - const struct ist where, const struct ist func, - const void *a1, const void *a2, const void *a3, const void *a4), - const struct ist msg) +/* Reports whether the trace is enabled for the specified arguments, needs to enable + * or disable tracking. It gets the same API as __trace() except for and + * which are not used and were dropped, and plockptr which is an optional pointer to + * the lockptr to be updated (or NULL) for tracking. The function returns: + * 0 if the trace is not enabled for the module or these values + * <0 if the trace matches some locking criteria but don't have the proper level. + * In this case the interested caller might have to consider disabling tracking. + * >0 if the trace is enabled for the given criteria. + * In all cases, will only be set if non-null and if a locking criterion + * matched. 
It will be up to the caller to enable tracking if desired. A casual + * tester not interested in adjusting tracking (i.e. calling the function before + * deciding so prepare a buffer to be dumped) will only need to pass 0 for plockptr + * and check if the result is >0. + */ +int __trace_enabled(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + const void **plockptr) { const struct listener *li = NULL; const struct proxy *fe = NULL; @@ -90,17 +100,13 @@ void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, const struct quic_conn *qc = NULL; #endif const void *lockon_ptr = NULL; - struct ist ist_func = ist(func); - char tnum[4]; - struct ist line[12]; - int words = 0; if (likely(src->state == TRACE_STATE_STOPPED)) - return; + return 0; /* check that at least one action is interested by this event */ if (((src->report_events | src->start_events | src->pause_events | src->stop_events) & mask) == 0) - return; + return 0; /* retrieve available information from the caller's arguments */ if (src->arg_def & TRC_ARGS_CONN) @@ -161,7 +167,7 @@ void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, /* check if we need to start the trace now */ if (src->state == TRACE_STATE_WAITING) { if ((src->start_events & mask) == 0) - return; + return 0; /* TODO: add update of lockon+lockon_ptr here */ HA_ATOMIC_STORE(&src->state, TRACE_STATE_RUNNING); @@ -190,16 +196,48 @@ void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, } if (src->lockon_ptr && src->lockon_ptr != lockon_ptr) - return; + return 0; - if (!src->lockon_ptr && lockon_ptr && src->state == TRACE_STATE_RUNNING) - HA_ATOMIC_STORE(&src->lockon_ptr, lockon_ptr); + if (*plockptr && !src->lockon_ptr && lockon_ptr && src->state == TRACE_STATE_RUNNING) + *plockptr = lockon_ptr; } /* here the trace is running and is tracking a desired 
item */ + if ((src->report_events & mask) == 0 || level > src->level) { + /* tracking did match, and might have to be disabled */ + return -1; + } - if ((src->report_events & mask) == 0 || level > src->level) - goto end; + /* OK trace still enabled */ + return 1; +} + +/* write a message for the given trace source */ +void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + void (*cb)(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4), + const struct ist msg) +{ + const void *lockon_ptr; + struct ist ist_func = ist(func); + char tnum[4]; + struct ist line[12]; + int words = 0; + int ret; + + lockon_ptr = NULL; + ret = __trace_enabled(level, mask, src, where, func, a1, a2, a3, a4, &lockon_ptr); + if (lockon_ptr) + HA_ATOMIC_STORE(&src->lockon_ptr, lockon_ptr); + + if (ret <= 0) { + if (ret < 0) // may have to disable tracking + goto end; + return; + } /* log the logging location truncated to 10 chars from the right so that * the line number and the end of the file name are there. From 180683bd2eb107ba75f62b4e2761925f50327f10 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 18:03:07 +0100 Subject: [PATCH 137/140] MINOR: trace: add a trace_no_cb() dummy callback for when to use no callback By default, passing a NULL cb to the trace functions will result in the source's default one to be used. For some cases we won't want to use any callback at all, not event the default one. Let's define a trace_no_cb() function for this, that does absolutely nothing. 
(cherry picked from commit 4b36d5e8de50f729f5bfe10667c1f9138f756c11) [wt: will be needed for further QUIC patches] Signed-off-by: Willy Tarreau (cherry picked from commit a1425395f544d79dbf7124b13ed3825eec3f6269) Signed-off-by: Christopher Faulet --- include/haproxy/trace.h | 4 ++++ src/trace.c | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/include/haproxy/trace.h b/include/haproxy/trace.h index 2c823e460..2ddc8d587 100644 --- a/include/haproxy/trace.h +++ b/include/haproxy/trace.h @@ -149,6 +149,10 @@ void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, const void *a1, const void *a2, const void *a3, const void *a4), const struct ist msg); +void trace_no_cb(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + void trace_register_source(struct trace_source *source); /* return a single char to describe a trace state */ diff --git a/src/trace.c b/src/trace.c index 27824c8e6..9e0785c99 100644 --- a/src/trace.c +++ b/src/trace.c @@ -303,6 +303,14 @@ void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, } } +/* this callback may be used when no output modification is desired */ +void trace_no_cb(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4) +{ + /* do nothing */ +} + /* registers trace source . Modifies the list element! * The {start,pause,stop,report} events are not changed so the source may * preset them. From b701d69ee7858282e8e157a141d630a7394e0ad6 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 24 Jan 2023 18:23:59 +0100 Subject: [PATCH 138/140] MINOR: trace: add the long awaited TRACE_PRINTF() TRACE_PRINTF() can be used to produce arbitrary trace contents at any trace level. 
It uses the exact same arguments as other TRACE_* macros, but here they are mandatory since they are followed by the format-string, though they may be filled with zeroes. The reason for the arguments is to match tracking or filtering and not pollute other non-inspected objects. It will probably be used inside loops, in which case there are two points to be careful about: - output atomicity is only per-message, so competing threads may see their messages interleaved. As such, it is recommended that the caller places a recognizable unique context at the beginning of the message such as a connection pointer. - iterating over arrays or lists for all requests could be very expensive. In order to avoid this it is best to condition the call via TRACE_ENABLED() with the same arguments, which will return the same decision. - messages longer than TRACE_MAX_MSG-1 (1023 by default) will be truncated. For example, in order to dump the list of HTTP headers between hpack and h2: if (outlen > 0 && TRACE_ENABLED(TRACE_LEVEL_DEVELOPER, H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, 0, 0, 0)) { int i; for (i = 0; list[i].n.len; i++) TRACE_PRINTF(TRACE_LEVEL_DEVELOPER, H2_EV_RX_FRAME|H2_EV_RX_HDR, h2c->conn, 0, 0, 0, "h2c=%p hdr[%d]=%s:%s", h2c, i, list[i].n.ptr, list[i].v.ptr); } In addition, a lower-level TRACE_PRINTF_LOC() macro is provided, that takes two extra arguments, the caller's location and the caller's function name. This will allow to emit composite traces from central functions on the behalf of another one. 
(cherry picked from commit b8b243ac6acd849cbfb29a99c9e295886834eef4) [wt: this will be needed by further QUIC patches] Signed-off-by: Willy Tarreau (cherry picked from commit 37a13cb0a72836194f48a2b013a1dc0d349ccff2) Signed-off-by: Christopher Faulet --- include/haproxy/defaults.h | 5 +++++ include/haproxy/trace.h | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h index 5fcd4ba98..fa44384fa 100644 --- a/include/haproxy/defaults.h +++ b/include/haproxy/defaults.h @@ -134,6 +134,11 @@ #define MAX_HDR_HISTORY 10 #endif +// max length of a TRACE_PRINTF() output buffer (one less char for the message) +#ifndef TRACE_MAX_MSG +#define TRACE_MAX_MSG 1024 +#endif + // max # of stick counters per session (at least 3 for sc0..sc2) #ifndef MAX_SESS_STKCTR #define MAX_SESS_STKCTR 3 diff --git a/include/haproxy/trace.h b/include/haproxy/trace.h index 2ddc8d587..bf9885df3 100644 --- a/include/haproxy/trace.h +++ b/include/haproxy/trace.h @@ -67,7 +67,9 @@ * simply omitted (in which case they will be replaced by a NULL). This * ordering allows many TRACE() calls to be placed using copy-paste and just * change the message at the beginning. Only TRACE_DEVEL(), TRACE_ENTER() and - * TRACE_LEAVE() will report the calling function's name. + * TRACE_LEAVE() will report the calling function's name. TRACE_PRINTF() does + * require all the optional a1..a4 to be passed (possibly zero) so that they're + * always followed by the format string, then the values to be formatted. * * TRACE_* will call the _trace() macro which will test if the trace is enabled * before calling the __trace() function. _trace() shouldn't be a function (nor @@ -109,6 +111,35 @@ #define TRACE_POINT(mask, args...) 
\ _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, TRC_5ARGS(0,##args,0,0,0,0,0), ist("in")) +/* This produces a printf-like trace at level for event mask and + * trace arguments . All args mandatory, but may be zero. No output + * callback will be used since we expect the caller to pass a fully formatted + * message that must not be degraded. The output will be truncated to + * TRACE_MAX_MSG-1 bytes (1023 by default). Caller must include for + * snprintf(). One call will lead to one independent message, which means that + * multiple messages may be interleaved between threads, hence the caller is + * encouraged to prepend a context at the beginning of the format string when + * dumping lists or arrays. The _LOC variation takes the caller's location and + * function name as an ist and a (const char *) respectively, it is meant for + * being called from wrapper function which will work on behalf of a caller. + */ +#define TRACE_PRINTF(level, mask, a1, a2, a3, a4, fmt, args...) \ + TRACE_PRINTF_LOC(level, mask, ist(TRC_LOC), __FUNCTION__, a1, a2, a3, a4, fmt, ##args) + +#define TRACE_PRINTF_LOC(level, mask, trc_loc, func, a1, a2, a3, a4, fmt, args...) \ + do { \ + if (TRACE_ENABLED((level), (mask), a1, a2, a3, a4)) { \ + char _msg[TRACE_MAX_MSG]; \ + size_t _msg_len; \ + _msg_len = snprintf(_msg, sizeof(_msg), (fmt), ##args); \ + if (_msg_len >= sizeof(_msg)) \ + _msg_len = sizeof(_msg) - 1; \ + _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, \ + trc_loc, func, a1, a2, a3, a4, \ + &trace_no_cb, ist2(_msg, _msg_len)); \ + } \ + } while (0) + #if defined(DEBUG_DEV) || defined(DEBUG_FULL) # define DBG_TRACE(msg, mask, args...) TRACE(msg, mask, ##args) # define DBG_TRACE_ERROR(msg, mask, args...) TRACE_ERROR(msg, mask, ##args) @@ -120,6 +151,8 @@ # define DBG_TRACE_ENTER(mask, args...) TRACE_ENTER(mask, ##args) # define DBG_TRACE_LEAVE(mask, args...) TRACE_LEAVE(mask, ##args) # define DBG_TRACE_POINT(mask, args...) 
TRACE_POINT(mask, ##args) +# define DBG_TRACE_PRINTF(level, args...) TRACE_PRINTF(level, ##args) +# define DBG_TRACE_PRINTF_LOC(level, args...) TRACE_PRINTF_LOC(level, ##args) #else # define DBG_TRACE(msg, mask, args...) do { /* do nothing */ } while(0) # define DBG_TRACE_ERROR(msg, mask, args...) do { /* do nothing */ } while(0) @@ -131,6 +164,8 @@ # define DBG_TRACE_ENTER(mask, args...) do { /* do nothing */ } while(0) # define DBG_TRACE_LEAVE(mask, args...) do { /* do nothing */ } while(0) # define DBG_TRACE_POINT(mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_PRINTF(level, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_PRINTF_LOC(level, args...) do { /* do nothing */ } while(0) #endif extern struct list trace_sources; From 251debfe0c83c129b29ca75204ded9d9981fba9a Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 17 Mar 2023 16:40:09 +0100 Subject: [PATCH 139/140] BUG/MAJOR: qpack: fix possible read out of bounds in static table CertiK Skyfall Team reported that passing an index greater than QPACK_SHT_SIZE in a qpack instruction referencing a literal field name with name reference or and indexed field line will cause a read out of bounds that may crash the process, and confirmed that this fix addresses the issue. This needs to be backported as far as 2.5. 
(cherry picked from commit f41dfc22b20b2b5c295e8d80e062b896e7153b88) Signed-off-by: Willy Tarreau (cherry picked from commit aac7a54fe0779cee666ec5cae0fe00e7c17faa36) Signed-off-by: Willy Tarreau --- src/qpack-dec.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qpack-dec.c b/src/qpack-dec.c index 2d8115645..a6e292327 100644 --- a/src/qpack-dec.c +++ b/src/qpack-dec.c @@ -335,7 +335,7 @@ int qpack_decode_fs(const unsigned char *raw, uint64_t len, struct buffer *tmp, goto out; } - if (static_tbl) { + if (static_tbl && index < QPACK_SHT_SIZE) { name = qpack_sht[index].n; value = qpack_sht[index].v; } @@ -370,7 +370,7 @@ int qpack_decode_fs(const unsigned char *raw, uint64_t len, struct buffer *tmp, goto out; } - if (static_tbl) { + if (static_tbl && index < QPACK_SHT_SIZE) { name = qpack_sht[index].n; } else { From 3c6be690bb444eeb307d1f84f2ebb1e02ac15c29 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Fri, 17 Mar 2023 17:36:07 +0100 Subject: [PATCH 140/140] [RELEASE] Released version 2.6.11 Released version 2.6.11 with the following main changes : - BUG/MEDIUM: proxy: properly stop backends on soft-stop - BUG/MEDIUM: resolvers: Properly stop server resolutions on soft-stop - DEBUG: cli/show_fd: Display connection error code - DEBUG: ssl-sock/show_fd: Display SSL error code - BUG/MINOR: tcp_sample: fix a bug in fc_dst_port and fc_dst_is_local sample fetches - BUG/MINOR: quic: Missing STREAM frame length updates - BUG/MEDIUM: connection: Preserve flags when a conn is removed from an idle list - BUG/MINOR: mux-h2: make sure the h2c task exists before refreshing it - MINOR: buffer: add br_single() to check if a buffer ring has more than one buf - BUG/MEDIUM: mux-h2: only restart sending when mux buffer is decongested - BUG/MINOR: mux-h2: set CO_SFL_STREAMER when sending lots of data - BUG/MINOR: quic: Missing STREAM frame data pointer updates - BUG/MEDIUM: listener: duplicate inherited FDs if needed - MINOR: h2: add h2_phdr_to_ist() to make 
ISTs from pseudo headers - MEDIUM: mux-h2/trace: add tracing support for headers - BUG/MINOR: mux-h2: Fix possible null pointer deref on h2c in _h2_trace_header() - BUG/MEDIUM: spoe: Don't set the default traget for the SPOE agent frontend - BUG/MINOR: proto_ux: report correct error when bind_listener fails - BUG/MINOR: protocol: fix minor memory leak in protocol_bind_all() - BUG/MINOR: sock_unix: match finalname with tempname in sock_unix_addrcmp() - MINOR: trace: add a TRACE_ENABLED() macro to determine if a trace is active - MINOR: trace: add a trace_no_cb() dummy callback for when to use no callback - MINOR: trace: add the long awaited TRACE_PRINTF() - BUG/MAJOR: qpack: fix possible read out of bounds in static table --- CHANGELOG | 26 ++++++++++++++++++++++++++ VERDATE | 2 +- VERSION | 2 +- doc/configuration.txt | 2 +- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e89a3faa7..180abdb71 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,32 @@ ChangeLog : =========== +2023/03/17 : 2.6.11 + - BUG/MEDIUM: proxy: properly stop backends on soft-stop + - BUG/MEDIUM: resolvers: Properly stop server resolutions on soft-stop + - DEBUG: cli/show_fd: Display connection error code + - DEBUG: ssl-sock/show_fd: Display SSL error code + - BUG/MINOR: tcp_sample: fix a bug in fc_dst_port and fc_dst_is_local sample fetches + - BUG/MINOR: quic: Missing STREAM frame length updates + - BUG/MEDIUM: connection: Preserve flags when a conn is removed from an idle list + - BUG/MINOR: mux-h2: make sure the h2c task exists before refreshing it + - MINOR: buffer: add br_single() to check if a buffer ring has more than one buf + - BUG/MEDIUM: mux-h2: only restart sending when mux buffer is decongested + - BUG/MINOR: mux-h2: set CO_SFL_STREAMER when sending lots of data + - BUG/MINOR: quic: Missing STREAM frame data pointer updates + - BUG/MEDIUM: listener: duplicate inherited FDs if needed + - MINOR: h2: add h2_phdr_to_ist() to make ISTs from 
pseudo headers + - MEDIUM: mux-h2/trace: add tracing support for headers + - BUG/MINOR: mux-h2: Fix possible null pointer deref on h2c in _h2_trace_header() + - BUG/MEDIUM: spoe: Don't set the default traget for the SPOE agent frontend + - BUG/MINOR: proto_ux: report correct error when bind_listener fails + - BUG/MINOR: protocol: fix minor memory leak in protocol_bind_all() + - BUG/MINOR: sock_unix: match finalname with tempname in sock_unix_addrcmp() + - MINOR: trace: add a TRACE_ENABLED() macro to determine if a trace is active + - MINOR: trace: add a trace_no_cb() dummy callback for when to use no callback + - MINOR: trace: add the long awaited TRACE_PRINTF() + - BUG/MAJOR: qpack: fix possible read out of bounds in static table + 2023/03/10 : 2.6.10 - BUG/MINOR: mworker: stop doing strtok directly from the env - BUG/MEDIUM: mworker: prevent inconsistent reload when upgrading from old versions diff --git a/VERDATE b/VERDATE index 941e9fc00..34e26dcc1 100644 --- a/VERDATE +++ b/VERDATE @@ -1,2 +1,2 @@ $Format:%ci$ -2023/03/10 +2023/03/17 diff --git a/VERSION b/VERSION index a04abec91..bc02b8685 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.10 +2.6.11 diff --git a/doc/configuration.txt b/doc/configuration.txt index 9e0b1dbb4..cfceedb76 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -3,7 +3,7 @@ Configuration Manual ---------------------- version 2.6 - 2023/03/10 + 2023/03/17 This document covers the configuration language as implemented in the version