1
0
mirror of https://github.com/systemd/systemd.git synced 2024-12-25 01:34:28 +03:00

journal-upload: Update watchdog while in curl_easy_perform

It was observed that a combination of high log throughput, slow I/O on the journal-remote side, and many nodes uploading simultaneously caused the journal-upload process to dump core because of watchdog starvation. This happens because journal-upload stays inside curl_easy_perform() whenever it cannot upload fast enough to reach the end of the journal: currently journal-upload returns from curl_easy_perform() only when the end of the journal is reached. Therefore a check is added in journal_input_callback() which updates the watchdog if the time elapsed since the start of the uploading process is greater than WATCHDOG_USEC/2.
This commit is contained in:
Klearchos Chaloulos 2016-04-05 13:47:04 +03:00
parent 050d7e1998
commit d79ca7a622
3 changed files with 27 additions and 0 deletions

View File

@ -25,6 +25,7 @@
#include "log.h"
#include "utf8.h"
#include "util.h"
#include "sd-daemon.h"
/**
* Write up to size bytes to buf. Return negative on error, and number of
@ -242,6 +243,28 @@ static ssize_t write_entry(char *buf, size_t size, Uploader *u) {
assert_not_reached("WTF?");
}
/**
 * Ping the service watchdog from inside a long-running upload.
 *
 * Meant to be called repeatedly while data is being fed to the transfer (it is
 * invoked at the top of journal_input_callback()). When
 * u->reset_reference_timestamp is set, the current monotonic time is recorded
 * as the start of a new interval and the flag is cleared. On subsequent calls,
 * once more than half of the watchdog timeout has elapsed since that reference
 * point, "WATCHDOG=1" is sent through sd_notify() and a new interval begins.
 *
 * Does nothing if the watchdog is not enabled or its state cannot be queried.
 */
static inline void check_update_watchdog(Uploader *u) {
        /* Start of the current interval; kept across calls. */
        static usec_t before;
        usec_t watchdog_usec, after, elapsed_time;

        assert(u);

        /* sd_watchdog_enabled() returns 0 when no keep-alive is expected and
         * only fills in watchdog_usec when it returns > 0, so both the error
         * case and the "disabled" case must bail out here. */
        if (sd_watchdog_enabled(false, &watchdog_usec) <= 0)
                return;

        if (u->reset_reference_timestamp) {
                before = now(CLOCK_MONOTONIC);
                u->reset_reference_timestamp = false;
                return;
        }

        after = now(CLOCK_MONOTONIC);
        elapsed_time = usec_sub(after, before);
        if (elapsed_time > watchdog_usec / 2) {
                log_debug("Update watchdog timer");
                sd_notify(false, "WATCHDOG=1");
                /* Restart the interval right away rather than waiting for the
                 * next call, so the time until that call is not left out of
                 * the measurement. */
                before = after;
        }
}
static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void *userp) {
Uploader *u = userp;
int r;
@ -252,6 +275,8 @@ static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void
assert(u);
assert(nmemb <= SSIZE_MAX / size);
check_update_watchdog(u);
j = u->journal;
while (j && filled < size * nmemb) {

View File

@ -494,6 +494,7 @@ static int perform_upload(Uploader *u) {
assert(u);
u->reset_reference_timestamp = true;
code = curl_easy_perform(u->easy);
if (code) {
if (u->error[0])

View File

@ -48,6 +48,7 @@ typedef struct Uploader {
size_t entries_sent;
char *last_cursor, *current_cursor;
bool reset_reference_timestamp;
} Uploader;
#define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC)