[MEDIUM] stats: report server and backend cumulated downtime
Hello, This patch implements new statistics for SLA calculation by adding new field 'Dwntime' with total down time since restart (both HTTP/CSV) and extending status field (HTTP) or inserting a new one (CSV) with time showing how long each server/backend is in a current state. Additionaly, down transations are also calculated and displayed for backends, so it is possible to know how many times selected backend was down, generating "No server is available to handle this request." error. New information are presentetd in two different ways: - for HTTP: a "human redable form", one of "100000d 23h", "23h 59m" or "59m 59s" - for CSV: seconds I believe that seconds resolution is enough. As there are more columns in the status page I decided to shrink some names to make more space: - Weight -> Wght - Check -> Chk - Down -> Dwn Making described changes I also made some improvements and fixed some small bugs: - don't increment s->health above 's->rise + s->fall - 1'. Previously it was incremented an then (re)set to 's->rise + s->fall - 1'. - do not set server down if it is down already - do not set server up if it is up already - fix colspan in multiple places (mostly introduced by my previous patch) - add missing "status" header to CSV - fix order of retries/redispatches in server (CSV) - s/Tthen/Then/ - s/server/backend/ in DATA_ST_PX_BE (dumpstats.c) Changes from previous version: - deal with negative time intervales - don't relay on s->state (SRV_RUNNING) - little reworked human_time + compacted format (no spaces). If needed it can be used in the future for other purposes by optionally making "cnt" as an argument - leave set_server_down mostly unchanged - only little reworked "process_chk: 9" - additional fields in CSV are appended to the rigth - fix "SEC" macro - named arguments (human_time, be_downtime, srv_downtime) Hope it is OK. If there are only cosmetic changes needed please fill free to correct it, however if there are some bigger changes required I would like to discuss it first or at last to know what exactly was changed especially since I already put this patch into my production server. :) Thank you, Best regards, Krzysztof Oledzki
This commit is contained in:
parent
365d1cd84c
commit
85130941e7
@ -27,6 +27,11 @@
|
||||
#include <common/config.h>
|
||||
#include <common/standard.h>
|
||||
|
||||
#define SEC 1
|
||||
#define MINUTE (60 * SEC)
|
||||
#define HOUR (60 * MINUTE)
|
||||
#define DAY (24 * HOUR)
|
||||
|
||||
/* eternity when exprimed in timeval */
|
||||
#ifndef TV_ETERNITY
|
||||
#define TV_ETERNITY (~0UL)
|
||||
@ -480,6 +485,7 @@ REGPRM3 static inline struct timeval *__tv_ms_add(struct timeval *tv, const stru
|
||||
tv1; \
|
||||
})
|
||||
|
||||
char *human_time(int t, short hz);
|
||||
|
||||
#endif /* _COMMON_TIME_H */
|
||||
|
||||
|
@ -39,7 +39,7 @@ int srv_redispatch_connect(struct session *t);
|
||||
|
||||
void recount_servers(struct proxy *px);
|
||||
void recalc_server_map(struct proxy *px);
|
||||
|
||||
int be_downtime(struct proxy *px);
|
||||
|
||||
/*
|
||||
* This function tries to find a running server with free connection slots for
|
||||
|
@ -31,7 +31,7 @@
|
||||
|
||||
#include <proto/queue.h>
|
||||
|
||||
|
||||
int srv_downtime(struct server *s);
|
||||
|
||||
#endif /* _PROTO_SERVER_H */
|
||||
|
||||
|
@ -111,6 +111,11 @@ struct proxy {
|
||||
unsigned int maxconn; /* max # of active sessions on the frontend */
|
||||
unsigned int fullconn; /* #conns on backend above which servers are used at full load */
|
||||
struct in_addr except_net, except_mask; /* don't x-forward-for for this address. FIXME: should support IPv6 */
|
||||
|
||||
unsigned down_trans; /* up-down transitions */
|
||||
unsigned down_time; /* total time the proxy was down */
|
||||
time_t last_change; /* last time, when the state was changed */
|
||||
|
||||
unsigned failed_conns, failed_resp; /* failed connect() and responses */
|
||||
unsigned retries, redispatches; /* retried and redispatched connections */
|
||||
unsigned denied_req, denied_resp; /* blocked requests/responses because of security concerns */
|
||||
|
@ -86,10 +86,14 @@ struct server {
|
||||
unsigned wscore; /* weight score, used during srv map computation */
|
||||
|
||||
unsigned failed_checks, down_trans; /* failed checks and up-down transitions */
|
||||
unsigned down_time; /* total time the server was down */
|
||||
time_t last_change; /* last time, when the state was changed */
|
||||
|
||||
unsigned failed_conns, failed_resp; /* failed connect() and responses */
|
||||
unsigned retries; /* retried connections */
|
||||
unsigned failed_secu; /* blocked responses because of security concerns */
|
||||
unsigned cum_sess; /* cumulated number of sessions really sent to this server */
|
||||
|
||||
long long bytes_in; /* number of bytes transferred from the client to the server */
|
||||
long long bytes_out; /* number of bytes transferred from the server to the client */
|
||||
};
|
||||
|
@ -717,6 +717,13 @@ int srv_redispatch_connect(struct session *t)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int be_downtime(struct proxy *px) {
|
||||
|
||||
if ((px->srv_act || px->srv_bck) && px->last_change < now.tv_sec) // ignore negative time
|
||||
return px->down_time;
|
||||
|
||||
return now.tv_sec - px->last_change + px->down_time;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local variables:
|
||||
|
@ -564,7 +564,8 @@ int cfg_parse_listen(const char *file, int linenum, char **args)
|
||||
tv_eternity(&curproxy->srvtimeout);
|
||||
tv_eternity(&curproxy->contimeout);
|
||||
tv_eternity(&curproxy->appsession_timeout);
|
||||
|
||||
|
||||
curproxy->last_change = now.tv_sec;
|
||||
curproxy->id = strdup(args[1]);
|
||||
curproxy->cap = rc;
|
||||
|
||||
@ -1380,6 +1381,7 @@ int cfg_parse_listen(const char *file, int linenum, char **args)
|
||||
LIST_INIT(&newsrv->pendconns);
|
||||
do_check = 0;
|
||||
newsrv->state = SRV_RUNNING; /* early server setup */
|
||||
newsrv->last_change = now.tv_sec;
|
||||
newsrv->id = strdup(args[1]);
|
||||
|
||||
/* several ways to check the port component :
|
||||
|
35
src/checks.c
35
src/checks.c
@ -57,9 +57,11 @@ static void set_server_down(struct server *s)
|
||||
struct session *sess;
|
||||
int xferred;
|
||||
|
||||
s->state &= ~SRV_RUNNING;
|
||||
|
||||
if (s->health == s->rise) {
|
||||
|
||||
s->last_change = now.tv_sec;
|
||||
s->state &= ~SRV_RUNNING;
|
||||
|
||||
recount_servers(s->proxy);
|
||||
s->proxy->map_state |= PR_MAP_RECALC;
|
||||
|
||||
@ -92,10 +94,13 @@ static void set_server_down(struct server *s)
|
||||
|
||||
Warning("%s", trash);
|
||||
send_log(s->proxy, LOG_ALERT, "%s", trash);
|
||||
|
||||
|
||||
if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
|
||||
Alert("%s '%s' has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
|
||||
send_log(s->proxy, LOG_EMERG, "%s %s has no server available !\n", proxy_type_str(s->proxy), s->proxy->id);
|
||||
s->proxy->last_change = now.tv_sec;
|
||||
s->proxy->down_trans++;
|
||||
|
||||
Alert("%s '%s' has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
|
||||
send_log(s->proxy, LOG_EMERG, "%s %s has no server available!\n", proxy_type_str(s->proxy), s->proxy->id);
|
||||
}
|
||||
s->down_trans++;
|
||||
}
|
||||
@ -455,13 +460,24 @@ void process_chk(struct task *t, struct timeval *next)
|
||||
/* there was a test running */
|
||||
if (s->result > 0) { /* good server detected */
|
||||
//fprintf(stderr, "process_chk: 9\n");
|
||||
s->health++; /* was bad, stays for a while */
|
||||
if (s->health >= s->rise) {
|
||||
s->state |= SRV_RUNNING;
|
||||
|
||||
if (s->health < s->rise + s->fall - 1) {
|
||||
s->health++; /* was bad, stays for a while */
|
||||
|
||||
if (s->health == s->rise) {
|
||||
int xferred;
|
||||
|
||||
if (s->last_change < now.tv_sec) // ignore negative times
|
||||
s->down_time += now.tv_sec - s->last_change;
|
||||
s->last_change = now.tv_sec;
|
||||
s->state |= SRV_RUNNING;
|
||||
|
||||
if (s->proxy->srv_bck == 0 && s->proxy->srv_act == 0) {
|
||||
if (s->proxy->last_change < now.tv_sec) // ignore negative times
|
||||
s->proxy->down_time += now.tv_sec - s->proxy->last_change;
|
||||
s->proxy->last_change = now.tv_sec;
|
||||
}
|
||||
|
||||
recount_servers(s->proxy);
|
||||
s->proxy->map_state |= PR_MAP_RECALC;
|
||||
|
||||
@ -493,7 +509,8 @@ void process_chk(struct task *t, struct timeval *next)
|
||||
send_log(s->proxy, LOG_NOTICE, "%s", trash);
|
||||
}
|
||||
|
||||
s->health = s->rise + s->fall - 1; /* OK now */
|
||||
if (s->health >= s->rise)
|
||||
s->health = s->rise + s->fall - 1; /* OK now */
|
||||
}
|
||||
s->curfd = -1; /* no check running anymore */
|
||||
fd_delete(fd);
|
||||
|
@ -16,7 +16,6 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <pwd.h>
|
||||
#include <grp.h>
|
||||
|
||||
@ -47,6 +46,7 @@
|
||||
#include <proto/proto_uxst.h>
|
||||
#include <proto/senddata.h>
|
||||
#include <proto/session.h>
|
||||
#include <proto/server.h>
|
||||
|
||||
/* This function parses a "stats" statement in the "global" section. It returns
|
||||
* -1 if there is any error, otherwise zero. If it returns -1, it may write an
|
||||
@ -186,8 +186,8 @@ int stats_dump_raw(struct session *s, struct uri_auth *uri, int flags)
|
||||
"dreq,dresp,"
|
||||
"ereq,econ,eresp,"
|
||||
"wretr,wredis,"
|
||||
"weight,act,bck,"
|
||||
"chkfail,chkdown"
|
||||
"status,weight,act,bck,"
|
||||
"chkfail,chkdown,lastchg,downtime,"
|
||||
"\n");
|
||||
|
||||
if (buffer_write_chunk(rep, &msg) != 0)
|
||||
@ -371,8 +371,8 @@ int stats_dump_http(struct session *s, struct uri_auth *uri, int flags)
|
||||
"dreq,dresp,"
|
||||
"ereq,econ,eresp,"
|
||||
"wretr,wredis,"
|
||||
"weight,act,bck,"
|
||||
"chkfail,chkdown"
|
||||
"status,weight,act,bck,"
|
||||
"chkfail,chkdown,lastchg,downtime,"
|
||||
"\n");
|
||||
}
|
||||
if (buffer_write_chunk(rep, &msg) != 0)
|
||||
@ -582,26 +582,26 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
if (flags & STAT_FMT_HTML) {
|
||||
/* print a new table */
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
"<table cols=\"22\" class=\"tbl\" width=\"100%%\">\n"
|
||||
"<table cols=\"23\" class=\"tbl\" width=\"100%%\">\n"
|
||||
"<tr align=\"center\" class=\"titre\">"
|
||||
"<th colspan=2 class=\"pxname\">%s</th>"
|
||||
"<th colspan=18 class=\"empty\"></th>"
|
||||
"<th colspan=21 class=\"empty\"></th>"
|
||||
"</tr>\n"
|
||||
"<tr align=\"center\" class=\"titre\">"
|
||||
"<th rowspan=2></th>"
|
||||
"<th colspan=2>Queue</th><th colspan=4>Sessions</th>"
|
||||
"<th colspan=2>Bytes</th><th colspan=2>Denied</th>"
|
||||
"<th colspan=3>Errors</th><th colspan=2>Warnings</th>"
|
||||
"<th colspan=6>Server</th>"
|
||||
"<th colspan=7>Server</th>"
|
||||
"</tr>\n"
|
||||
"<tr align=\"center\" class=\"titre\">"
|
||||
"<th>Cur</th><th>Max</th><th>Cur</th><th>Max</th>"
|
||||
"<th>Limit</th><th>Cumul</th><th>In</th><th>Out</th>"
|
||||
"<th>Req</th><th>Resp</th><th>Req</th><th>Conn</th>"
|
||||
"<th>Resp</th><th>Retr</th><th>Redis</th>"
|
||||
"<th>Status</th><th>Weight</th><th>Act</th>"
|
||||
"<th>Bck</th><th>Check</th><th>Down</th></tr>\n"
|
||||
"",
|
||||
"<th>Status</th><th>Wght</th><th>Act</th>"
|
||||
"<th>Bck</th><th>Chk</th><th>Dwn</th><th>Dwntme</th>\n"
|
||||
"</tr>",
|
||||
px->id);
|
||||
|
||||
if (buffer_write_chunk(rep, &msg) != 0)
|
||||
@ -632,7 +632,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
/* server status : reflect frontend status */
|
||||
"<td align=center>%s</td>"
|
||||
/* rest of server: nothing */
|
||||
"<td align=center colspan=5></td></tr>"
|
||||
"<td align=center colspan=6></td></tr>"
|
||||
"",
|
||||
px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
|
||||
px->bytes_in, px->bytes_out,
|
||||
@ -657,7 +657,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
/* server status : reflect frontend status */
|
||||
"%s,"
|
||||
/* rest of server: nothing */
|
||||
",,,,,"
|
||||
",,,,,,,"
|
||||
"\n",
|
||||
px->id,
|
||||
px->feconn, px->feconn_max, px->maxconn, px->cum_feconn,
|
||||
@ -734,6 +734,11 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
|
||||
/* status */
|
||||
chunk_printf(&msg, sizeof(trash), "<td nowrap>");
|
||||
|
||||
if (sv->state & SRV_CHECKED)
|
||||
chunk_printf(&msg, sizeof(trash), "%s ",
|
||||
human_time(now.tv_sec - sv->last_change, 1));
|
||||
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
srv_hlt_st[sv_state],
|
||||
(sv->state & SRV_RUNNING) ? (sv->health - sv->rise + 1) : (sv->health),
|
||||
@ -749,14 +754,17 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
(sv->state & SRV_BACKUP) ? "-" : "Y",
|
||||
(sv->state & SRV_BACKUP) ? "Y" : "-");
|
||||
|
||||
/* check failures : unique, fatal */
|
||||
/* check failures: unique, fatal, down time */
|
||||
if (sv->state & SRV_CHECKED)
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
"<td align=right>%d</td><td align=right>%d</td></tr>\n",
|
||||
sv->failed_checks, sv->down_trans);
|
||||
"<td align=right>%d</td><td align=right>%d</td>"
|
||||
"<td nowrap align=right>%s</td>"
|
||||
"</tr>\n",
|
||||
sv->failed_checks, sv->down_trans,
|
||||
human_time(srv_downtime(sv), 1));
|
||||
else
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
"<td colspan=2></td></tr>\n");
|
||||
"<td colspan=3></td></tr>\n");
|
||||
} else {
|
||||
static char *srv_hlt_st[5] = { "DOWN,", "DOWN %d/%d,", "UP %d/%d,",
|
||||
"UP,", "no check," };
|
||||
@ -774,7 +782,7 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
/* errors : request, connect, response */
|
||||
",%d,%d,"
|
||||
/* warnings: retries, redispatches */
|
||||
",%d,"
|
||||
"%d,,"
|
||||
"",
|
||||
px->id, sv->id,
|
||||
sv->nbpend, sv->nbpend_max,
|
||||
@ -798,14 +806,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
(sv->state & SRV_BACKUP) ? 0 : 1,
|
||||
(sv->state & SRV_BACKUP) ? 1 : 0);
|
||||
|
||||
/* check failures : unique, fatal */
|
||||
/* check failures: unique, fatal; last change, total downtime */
|
||||
if (sv->state & SRV_CHECKED)
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
"%d,%d,\n",
|
||||
sv->failed_checks, sv->down_trans);
|
||||
"%d,%d,%d,%d,\n",
|
||||
sv->failed_checks, sv->down_trans,
|
||||
now.tv_sec - sv->last_change, srv_downtime(sv));
|
||||
else
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
",,\n");
|
||||
",,,,\n");
|
||||
}
|
||||
if (buffer_write_chunk(rep, &msg) != 0)
|
||||
return 0;
|
||||
@ -847,23 +856,29 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
"<td align=right></td><td align=right>%d</td><td align=right>%d</td>\n"
|
||||
/* warnings: retries, redispatches */
|
||||
"<td align=right>%d</td><td align=right>%d</td>"
|
||||
/* server status : reflect backend status (up/down) : we display UP
|
||||
/* backend status: reflect backend status (up/down): we display UP
|
||||
* if the backend has known working servers or if it has no server at
|
||||
* all (eg: for stats). Tthen we display the total weight, number of
|
||||
* all (eg: for stats). Then we display the total weight, number of
|
||||
* active and backups. */
|
||||
"<td align=center>%s</td><td align=center>%d</td>"
|
||||
"<td align=center>%d</td><td align=center>%d</td>"
|
||||
/* rest of server: nothing */
|
||||
"<td align=center colspan=2></td></tr>"
|
||||
"",
|
||||
"<td align=center nowrap>%s %s</td><td align=center>%d</td>"
|
||||
"<td align=center>%d</td><td align=center>%d</td>",
|
||||
px->nbpend /* or px->totpend ? */, px->nbpend_max,
|
||||
px->beconn, px->beconn_max, px->fullconn, px->cum_beconn,
|
||||
px->bytes_in, px->bytes_out,
|
||||
px->denied_req, px->denied_resp,
|
||||
px->failed_conns, px->failed_resp,
|
||||
px->retries, px->redispatches,
|
||||
human_time(now.tv_sec - px->last_change, 1),
|
||||
(px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
|
||||
px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
|
||||
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
/* rest of backend: nothing, down transformations, total downtime */
|
||||
"<td align=center> </td><td align=\"right\">%d</td>"
|
||||
"<td align=\"right\" nowrap>%s</td>"
|
||||
"</tr>",
|
||||
px->down_trans,
|
||||
px->srv?human_time(be_downtime(px), 1):" ");
|
||||
} else {
|
||||
chunk_printf(&msg, sizeof(trash),
|
||||
/* pxid, name */
|
||||
@ -880,14 +895,15 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
",%d,%d,"
|
||||
/* warnings: retries, redispatches */
|
||||
"%d,%d,"
|
||||
/* server status : reflect backend status (up/down) : we display UP
|
||||
/* backend status: reflect backend status (up/down): we display UP
|
||||
* if the backend has known working servers or if it has no server at
|
||||
* all (eg: for stats). Tthen we display the total weight, number of
|
||||
* all (eg: for stats). Then we display the total weight, number of
|
||||
* active and backups. */
|
||||
"%s,"
|
||||
"%d,%d,%d,"
|
||||
/* rest of server: nothing */
|
||||
",,"
|
||||
/* rest of backend: nothing, down transformations,
|
||||
* last change, total downtime. */
|
||||
",%d,%d,%d,"
|
||||
"\n",
|
||||
px->id,
|
||||
px->nbpend /* or px->totpend ? */, px->nbpend_max,
|
||||
@ -897,7 +913,9 @@ int stats_dump_proxy(struct session *s, struct proxy *px, struct uri_auth *uri,
|
||||
px->failed_conns, px->failed_resp,
|
||||
px->retries, px->redispatches,
|
||||
(px->srv_map_sz > 0 || !px->srv) ? "UP" : "DOWN",
|
||||
px->srv_map_sz * gcd, px->srv_act, px->srv_bck);
|
||||
px->srv_map_sz * gcd, px->srv_act, px->srv_bck,
|
||||
px->down_trans, now.tv_sec - px->last_change,
|
||||
px->srv?be_downtime(px):0);
|
||||
}
|
||||
if (buffer_write_chunk(rep, &msg) != 0)
|
||||
return 0;
|
||||
|
@ -11,10 +11,19 @@
|
||||
*/
|
||||
|
||||
#include <common/config.h>
|
||||
#include <common/time.h>
|
||||
|
||||
#include <types/backend.h>
|
||||
#include <types/proxy.h>
|
||||
#include <types/server.h>
|
||||
|
||||
int srv_downtime(struct server *s) {
|
||||
|
||||
if ((s->state & SRV_RUNNING) && s->last_change < now.tv_sec) // ignore negative time
|
||||
return s->down_time;
|
||||
|
||||
return now.tv_sec - s->last_change + s->down_time;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local variables:
|
||||
|
34
src/time.c
34
src/time.c
@ -142,6 +142,40 @@ REGPRM2 int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2)
|
||||
return __tv_isgt(tv1, tv2);
|
||||
}
|
||||
|
||||
char *human_time(int t, short hz) {
|
||||
static char rv[sizeof("24855d23h")+1]; // longest of "23h59m" and "59m59s"
|
||||
char *p = rv;
|
||||
int cnt=2; // print two numbers
|
||||
|
||||
if (unlikely(t < 0 || hz <= 0)) {
|
||||
sprintf(p, "?");
|
||||
return rv;
|
||||
}
|
||||
|
||||
if (unlikely(hz > 1))
|
||||
t /= hz;
|
||||
|
||||
if (t >= DAY) {
|
||||
p += sprintf(p, "%dd", t / DAY);
|
||||
cnt--;
|
||||
}
|
||||
|
||||
if (cnt && t % DAY / HOUR) {
|
||||
p += sprintf(p, "%dh", t % DAY / HOUR);
|
||||
cnt--;
|
||||
}
|
||||
|
||||
if (cnt && t % HOUR / MINUTE) {
|
||||
p += sprintf(p, "%dm", t % HOUR / MINUTE);
|
||||
cnt--;
|
||||
}
|
||||
|
||||
if ((cnt && t % MINUTE) || !t) // also display '0s'
|
||||
p += sprintf(p, "%ds", t % MINUTE / SEC);
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
/*
|
||||
* Local variables:
|
||||
* c-indent-level: 8
|
||||
|
Loading…
x
Reference in New Issue
Block a user