From d79ca7a622abbb0df6f5166cc0e4669373d9a614 Mon Sep 17 00:00:00 2001 From: Klearchos Chaloulos Date: Tue, 5 Apr 2016 13:47:04 +0300 Subject: [PATCH] journal-upload: Update watchdog while in curl_easy_perform It was observed that a combination of high log throughput, low I/O speed on the journal-remote side and many nodes uploading simultaneously caused the journal-upload process to dump core because of watchdog starvation. This happens because journal-upload stays in curl_easy_perform() when it cannot upload fast enough to reach the end of the journal: currently journal-upload returns from curl_easy_perform() only when the end of the journal is reached. Therefore a check is added in journal_input_callback(), which updates the watchdog if the time elapsed since the reference timestamp (taken at the start of the upload and again after each watchdog update) is greater than WATCHDOG_USEC/2. --- src/journal-remote/journal-upload-journal.c | 25 +++++++++++++++++++++ src/journal-remote/journal-upload.c | 1 + src/journal-remote/journal-upload.h | 1 + 3 files changed, 27 insertions(+) diff --git a/src/journal-remote/journal-upload-journal.c b/src/journal-remote/journal-upload-journal.c index e61b6bc68f5..ac6eb58a9f4 100644 --- a/src/journal-remote/journal-upload-journal.c +++ b/src/journal-remote/journal-upload-journal.c @@ -25,6 +25,7 @@ #include "log.h" #include "utf8.h" #include "util.h" +#include "sd-daemon.h" /** * Write up to size bytes to buf. 
Return negative on error, and number of
@@ -242,6 +243,35 @@ static ssize_t write_entry(char *buf, size_t size, Uploader *u) {
         assert_not_reached("WTF?");
 }
 
+/* Called from journal_input_callback() to keep the service watchdog alive.
+ * A reference timestamp is taken whenever u->reset_reference_timestamp is
+ * set; on later calls WATCHDOG=1 is sent once more than half of the watchdog
+ * interval has passed since then, and the flag is raised again so that the
+ * next call takes a new reference timestamp. */
+static inline void check_update_watchdog(Uploader *u) {
+        usec_t watchdog_usec;
+        static usec_t before;
+        usec_t after;
+        usec_t elapsed_time;
+
+        /* A return value of 0 means that no watchdog keep-alive is expected; no
+         * timeout is provided in that case, so only proceed on a positive result. */
+        if (sd_watchdog_enabled(false, &watchdog_usec) <= 0)
+                return;
+        if (u->reset_reference_timestamp) {
+                before = now(CLOCK_MONOTONIC);
+                u->reset_reference_timestamp = false;
+        } else {
+                after = now(CLOCK_MONOTONIC);
+                elapsed_time = usec_sub(after, before);
+                if (elapsed_time > watchdog_usec / 2) {
+                        log_debug("Update watchdog timer");
+                        sd_notify(false, "WATCHDOG=1");
+                        u->reset_reference_timestamp = true;
+                }
+        }
+}
+
 static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void *userp) {
         Uploader *u = userp;
         int r;
@@ -252,6 +282,8 @@ static size_t journal_input_callback(void *buf, size_t size, size_t nmemb, void
         assert(u);
         assert(nmemb <= SSIZE_MAX / size);
 
+        check_update_watchdog(u);
+
         j = u->journal;
 
         while (j && filled < size * nmemb) {
diff --git a/src/journal-remote/journal-upload.c b/src/journal-remote/journal-upload.c
index 6e1c3bb9ef4..f2e9117f9f7 100644
--- a/src/journal-remote/journal-upload.c
+++ b/src/journal-remote/journal-upload.c
@@ -494,6 +494,7 @@ static int perform_upload(Uploader *u) {
 
         assert(u);
 
+        u->reset_reference_timestamp = true;
         code = curl_easy_perform(u->easy);
         if (code) {
                 if (u->error[0])
diff --git a/src/journal-remote/journal-upload.h b/src/journal-remote/journal-upload.h
index b8cd04d5275..a31735bd083 100644
--- a/src/journal-remote/journal-upload.h
+++ b/src/journal-remote/journal-upload.h
@@ -48,6 +48,7 @@ typedef struct Uploader {
 
         size_t entries_sent;
         char *last_cursor, *current_cursor;
+        bool reset_reference_timestamp;
 } Uploader;
 
 #define JOURNAL_UPLOAD_POLL_TIMEOUT (10 * USEC_PER_SEC)