From 6318d33ce62580d6ba6d418890e69300715291c4 Mon Sep 17 00:00:00 2001
From: William Dauchy
Date: Sat, 2 May 2020 21:52:36 +0200
Subject: [PATCH] BUG/MEDIUM: connections: force connections cleanup on server
 changes

I've been trying to understand a change of behaviour between v2.2dev5
and v2.2dev6. Our probe regularly adds and removes servers on a given
backend, as follows:

  # echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
  113 be_foo 1 srv0 10.236.139.34 2 0 1 1 263 15 3 4 6 0 0 0 - 31255 -
  113 be_foo 2 srv1 0.0.0.0 0 1 256 256 0 15 3 0 14 0 0 0 - 0 -

  -> curl on the corresponding frontend: reply from server:31255

  # echo "set server be_foo/srv1 addr 10.236.139.34 port 31257" | sudo socat stdio /var/lib/haproxy/stats
  IP changed from '0.0.0.0' to '10.236.139.34', port changed from '0' to '31257' by 'stats socket command'
  # echo "set server be_foo/srv1 weight 256" | sudo socat stdio /var/lib/haproxy/stats
  # echo "set server be_foo/srv1 check-port 8500" | sudo socat stdio /var/lib/haproxy/stats
  health check port updated.
  # echo "set server be_foo/srv1 state ready" | sudo socat stdio /var/lib/haproxy/stats
  # echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
  113 be_foo 1 srv0 10.236.139.34 2 0 1 1 105 15 3 4 6 0 0 0 - 31255 -
  113 be_foo 2 srv1 10.236.139.34 2 0 256 256 2319 15 3 2 6 0 0 0 - 31257 -

  -> curl on the corresponding frontend: reply from server:31257
     (notice the difference of weight)

  # echo "set server be_foo/srv1 state maint" | sudo socat stdio /var/lib/haproxy/stats
  # echo "set server be_foo/srv1 addr 0.0.0.0 port 0" | sudo socat stdio /var/lib/haproxy/stats
  IP changed from '10.236.139.34' to '0.0.0.0', port changed from '31257' to '0' by 'stats socket command'
  # echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
  113 be_foo 1 srv0 10.236.139.34 2 0 1 1 263 15 3 4 6 0 0 0 - 31255 -
  113 be_foo 2 srv1 0.0.0.0 0 1 256 256 0 15 3 0 14 0 0 0 - 0 -

  -> curl on the corresponding frontend: reply from server:31255

  # echo "set server be_foo/srv1 addr 10.236.139.34 port 31256" | sudo socat stdio /var/lib/haproxy/stats
  IP changed from '0.0.0.0' to '10.236.139.34', port changed from '0' to '31256' by 'stats socket command'
  # echo "set server be_foo/srv1 weight 256" | sudo socat stdio /var/lib/haproxy/stats
  # echo "set server be_foo/srv1 check-port 8500" | sudo socat stdio /var/lib/haproxy/stats
  health check port updated.
  # echo "set server be_foo/srv1 state ready" | sudo socat stdio /var/lib/haproxy/stats
  # echo "show servers state be_foo" | sudo socat stdio /var/lib/haproxy/stats
  113 be_foo 1 srv0 10.236.139.34 2 0 1 1 105 15 3 4 6 0 0 0 - 31255 -
  113 be_foo 2 srv1 10.236.139.34 2 0 256 256 2319 15 3 2 6 0 0 0 - 31256 -

  -> curl on the corresponding frontend: reply from server:31257 (!)

Here we would indeed expect to get an answer from server:31256.

The issue is closely tied to the use of `pool-purge-delay` with a value
higher than the duration of the test, 10s in our case.

A git bisect between dev5 and dev6 points to commit 079cb9af22da6
("MEDIUM: connections: Revamp the way idle connections are killed") as
the origin of this new behaviour. If I understand the latter correctly,
it was mostly a matter of chance that we had not seen the issue
earlier.

This patch forces the cleanup of idle connections in the two following
cases:
- we set a (still running) server to maintenance
- we change the ip/port of a server

This commit should be backported to 2.1, 2.0, and 1.9.
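For reference, a minimal configuration sketch of the kind of backend
exercised above. This is an assumption for illustration only (the real
backend definition is not part of this report): it reuses the backend
and server names from the transcript and picks an arbitrary
pool-purge-delay larger than the test duration.

  backend be_foo
      # keep idle server connections around for reuse between requests
      http-reuse safe
      # srv0 is the server initially serving traffic in the transcript
      server srv0 10.236.139.34:31255 weight 1
      # srv1 is a placeholder driven at runtime through the stats socket;
      # pool-purge-delay is (arbitrarily) set well above the test duration
      # so idle connections are not purged between the CLI changes
      server srv1 0.0.0.0:0 weight 256 disabled pool-purge-delay 60s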
Signed-off-by: William Dauchy
---
 src/server.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/server.c b/src/server.c
index 4d5e706d3..94e7aeed1 100644
--- a/src/server.c
+++ b/src/server.c
@@ -55,6 +55,7 @@ static int srv_apply_lastaddr(struct server *srv, int *err_code);
 static int srv_set_fqdn(struct server *srv, const char *fqdn, int dns_locked);
 static void srv_state_parse_line(char *buf, const int version, char **params, char **srv_params);
 static int srv_state_get_version(FILE *f);
+static void srv_cleanup_connections(struct server *srv);
 
 /* List head of all known server keywords */
 static struct srv_kw_list srv_keywords = {
@@ -3669,8 +3670,11 @@ const char *update_server_addr_port(struct server *s, const char *addr, const ch
         }
 
  out:
-        if (changed)
+        if (changed) {
+                /* force connection cleanup on the given server */
+                srv_cleanup_connections(s);
                 srv_set_dyncookie(s);
+        }
         if (updater)
                 chunk_appendf(msg, " by '%s'", updater);
         chunk_appendf(msg, "\n");
@@ -4832,6 +4836,8 @@ static void srv_update_status(struct server *s)
                 if (s->onmarkeddown & HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS)
                         srv_shutdown_streams(s, SF_ERR_DOWN);
 
+                /* force connection cleanup on the given server */
+                srv_cleanup_connections(s);
                 /* we might have streams queued on this server and waiting for
                  * a connection. Those which are redispatchable will be queued
                  * to another server or to the proxy itself.
@@ -5160,6 +5166,37 @@ struct task *srv_cleanup_toremove_connections(struct task *task, void *context,
         return task;
 }
 
+/* cleanup connections for a given server
+ * might be useful when going on forced maintenance or live changing ip/port
+ */
+void srv_cleanup_connections(struct server *srv)
+{
+        struct connection *conn;
+        int did_remove;
+        int i;
+        int j;
+
+        HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock);
+        for (i = 0; i < global.nbthread; i++) {
+                did_remove = 0;
+                HA_SPIN_LOCK(OTHER_LOCK, &toremove_lock[i]);
+                for (j = 0; j < srv->curr_idle_conns; j++) {
+                        conn = MT_LIST_POP(&srv->idle_conns[i], struct connection *, list);
+                        if (!conn)
+                                conn = MT_LIST_POP(&srv->safe_conns[i],
+                                                   struct connection *, list);
+                        if (!conn)
+                                break;
+                        did_remove = 1;
+                        MT_LIST_ADDQ(&toremove_connections[i], (struct mt_list *)&conn->list);
+                }
+                HA_SPIN_UNLOCK(OTHER_LOCK, &toremove_lock[i]);
+                if (did_remove)
+                        task_wakeup(idle_conn_cleanup[i], TASK_WOKEN_OTHER);
+        }
+        HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock);
+}
+
 struct task *srv_cleanup_idle_connections(struct task *task, void *context, unsigned short state)
 {
         struct server *srv;