Merge pull request #4362 from poettering/journalbootlistfix

2024-12-25 23:21:33 +03:00 · 2016-10-13 07:42:39 -04:00 · 2016-10-13 07:42:39 -04:00 · c1a9199ec4
commit c1a9199ec4
parent 6612379adf da597d2b52
6 changed files with 172 additions and 73 deletions
--- a/9
+++ b/9
@ -40,14 +40,6 @@ Features:

 * switch to ProtectSystem=strict for all our long-running services where that's possible

-* introduce an "invocation ID" for units, that is randomly generated, and
-  identifies each runtime-cycle of a unit. It should be set freshly each time
-  we traverse inactive → activating/active, and should be the primary key to
-  map offline data (stored in the journal) with online bus objects. Let's pass
-  this as $SYSTEMD_INVOCATION_ID to services, as well as set this as xattr on
-  the cgroup of a services. The former is accessible without privileges, the
-  latter ensures the ID cannot be faked.
-
 * If RootDirectory= is used, mount /proc, /sys, /dev into it, if not mounted yet

 * Permit masking specific netlink APIs with RestrictAddressFamily=
@ -498,7 +490,6 @@ Features:
    message that works, but alraedy after a short tiemout
  - check if we can make journalctl by default use --follow mode inside of less if called without args?
  - maybe add API to send pairs of iovecs via sd_journal_send
-  - journal: when writing journal auto-rotate if time jumps backwards
  - journal: add a setgid "systemd-journal" utility to invoke from libsystemd-journal, which passes fds via STDOUT and does PK access
  - journactl: support negative filtering, i.e. FOOBAR!="waldo",
    and !FOOBAR for events without FOOBAR.
--- a/src/journal/journal-file.c
+++ b/src/journal/journal-file.c
@ -568,8 +568,8 @@ static int journal_file_verify_header(JournalFile *f) {
                return -ENODATA;

        if (f->writable) {
-                uint8_t state;
                sd_id128_t machine_id;
+                uint8_t state;
                int r;

                r = sd_id128_get_machine(&machine_id);
@ -590,6 +590,14 @@ static int journal_file_verify_header(JournalFile *f) {
                        log_debug("Journal file %s has unknown state %i.", f->path, state);
                        return -EBUSY;
                }
+
+                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
+                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
+                 * bisection. */
+                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
+                        log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
+                        return -ETXTBSY;
+                }
        }

        f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
@ -747,12 +755,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
        assert(ret);

        /* Objects may only be located at multiple of 64 bit */
-        if (!VALID64(offset))
+        if (!VALID64(offset)) {
+                log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
                return -EBADMSG;
+        }

        /* Object may not be located in the file header */
-        if (offset < le64toh(f->header->header_size))
+        if (offset < le64toh(f->header->header_size)) {
+                log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
                return -EBADMSG;
+        }

        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
        if (r < 0)
@ -761,17 +773,29 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset
        o = (Object*) t;
        s = le64toh(o->object.size);

-        if (s < sizeof(ObjectHeader))
+        if (s == 0) {
+                log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
                return -EBADMSG;
+        }
+        if (s < sizeof(ObjectHeader)) {
+                log_debug("Attempt to move to overly short object: %" PRIu64, offset);
+                return -EBADMSG;
+        }

-        if (o->object.type <= OBJECT_UNUSED)
+        if (o->object.type <= OBJECT_UNUSED) {
+                log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
                return -EBADMSG;
+        }

-        if (s < minimum_header_size(o))
+        if (s < minimum_header_size(o)) {
+                log_debug("Attempt to move to truncated object: %" PRIu64, offset);
                return -EBADMSG;
+        }

-        if (type > OBJECT_UNUSED && o->object.type != type)
+        if (type > OBJECT_UNUSED && o->object.type != type) {
+                log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
                return -EBADMSG;
+        }

        if (s > sizeof(ObjectHeader)) {
                r = journal_file_move_to(f, type, false, offset, s, &t);
@ -2472,6 +2496,37 @@ int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        return 0;
 }

+static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
+
+        /* Increase or decrease the specified index, in the right direction. */
+
+        if (direction == DIRECTION_DOWN) {
+                if (*i >= n - 1)
+                        return 0;
+
+                (*i) ++;
+        } else {
+                if (*i <= 0)
+                        return 0;
+
+                (*i) --;
+        }
+
+        return 1;
+}
+
+static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
+
+        /* Consider it an error if any of the two offsets is uninitialized */
+        if (old_offset == 0 || new_offset == 0)
+                return false;
+
+        /* If we go down, the new offset must be larger than the old one. */
+        return direction == DIRECTION_DOWN ?
+                new_offset > old_offset  :
+                new_offset < old_offset;
+}
+
 int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
@ -2502,36 +2557,34 @@ int journal_file_next_entry(
                if (r <= 0)
                        return r;

-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
-
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
-
-                        i--;
-                }
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
        }

        /* And jump to it */
-        r = generic_array_get(f,
-                              le64toh(f->header->entry_array_offset),
-                              i,
-                              ret, &ofs);
-        if (r == -EBADMSG && direction == DIRECTION_DOWN) {
-                /* Special case: when we iterate throught the journal file linearly, and hit an entry we can't read,
-                 * consider this the end of the journal file. */
-                log_debug_errno(r, "Encountered entry we can't read while iterating through journal file. Considering this the end of the file.");
-                return 0;
-        }
-        if (r <= 0)
-                return r;
+        for (;;) {
+                r = generic_array_get(f,
+                                      le64toh(f->header->entry_array_offset),
+                                      i,
+                                      ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;

-        if (p > 0 &&
-            (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
-                log_debug("%s: entry array corrupted at entry %" PRIu64, f->path, i);
+                /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
+                 * the next one might work for us instead. */
+                log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
+
+        /* Ensure our array is properly ordered. */
+        if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
                return -EBADMSG;
        }

@ -2548,9 +2601,9 @@ int journal_file_next_entry_for_data(
                direction_t direction,
                Object **ret, uint64_t *offset) {

-        uint64_t n, i;
-        int r;
+        uint64_t i, n, ofs;
        Object *d;
+        int r;

        assert(f);
        assert(p > 0 || !o);
@ -2582,25 +2635,39 @@ int journal_file_next_entry_for_data(
                if (r <= 0)
                        return r;

-                if (direction == DIRECTION_DOWN) {
-                        if (i >= n - 1)
-                                return 0;
-
-                        i++;
-                } else {
-                        if (i <= 0)
-                                return 0;
-
-                        i--;
-                }
-
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
        }

-        return generic_array_get_plus_one(f,
-                                          le64toh(d->data.entry_offset),
-                                          le64toh(d->data.entry_array_offset),
-                                          i,
-                                          ret, offset);
+        for (;;) {
+                r = generic_array_get_plus_one(f,
+                                               le64toh(d->data.entry_offset),
+                                               le64toh(d->data.entry_array_offset),
+                                               i,
+                                               ret, &ofs);
+                if (r > 0)
+                        break;
+                if (r != -EBADMSG)
+                        return r;
+
+                log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
+
+                r = bump_array_index(&i, direction, n);
+                if (r <= 0)
+                        return r;
+        }
+
+        /* Ensure our array is properly ordered. */
+        if (p > 0 && check_properly_ordered(ofs, p, direction)) {
+                log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
+                return -EBADMSG;
+        }
+
+        if (offset)
+                *offset = ofs;
+
+        return 1;
 }

 int journal_file_move_to_entry_by_offset_for_data(
@ -3271,7 +3338,8 @@ int journal_file_open_reliably(
                    -EBUSY,             /* unclean shutdown */
                    -ESHUTDOWN,         /* already archived */
                    -EIO,               /* IO error, including SIGBUS on mmap */
-                    -EIDRM              /* File has been deleted */))
+                    -EIDRM,             /* File has been deleted */
+                    -ETXTBSY))          /* File is from the future */
                return r;

        if ((flags & O_ACCMODE) == O_RDONLY)
--- a/src/journal/journal-vacuum.c
+++ b/src/journal/journal-vacuum.c
@ -343,7 +343,7 @@ finish:
                free(list[i].filename);
        free(list);

-        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals on disk.", format_bytes(sbytes, sizeof(sbytes), freed));
+        log_full(verbose ? LOG_INFO : LOG_DEBUG, "Vacuuming done, freed %s of archived journals from %s.", format_bytes(sbytes, sizeof(sbytes), freed), directory);

        return r;
 }
--- a/src/journal/journalctl.c
+++ b/src/journal/journalctl.c
@ -1091,8 +1091,10 @@ static int discover_next_boot(sd_journal *j,
                r = sd_journal_previous(j);
        if (r < 0)
                return r;
-        else if (r == 0)
+        else if (r == 0) {
+                log_debug("Whoopsie! We found a boot ID but can't read its last entry.");
                return -ENODATA; /* This shouldn't happen. We just came from this very boot ID. */
+        }

        r = sd_journal_get_realtime_usec(j, &next_boot->last);
        if (r < 0)
@ -2264,7 +2266,7 @@ int main(int argc, char *argv[]) {
                if (r < 0)
                        goto finish;

-                printf("Archived and active journals take up %s on disk.\n",
+                printf("Archived and active journals take up %s in the file system.\n",
                       format_bytes(sbytes, sizeof(sbytes), bytes));
                goto finish;
        }
--- a/src/journal/journald-server.c
+++ b/src/journal/journald-server.c
@ -595,52 +595,86 @@ static void server_cache_hostname(Server *s) {

 static bool shall_try_append_again(JournalFile *f, int r) {
        switch(r) {
+
        case -E2BIG:           /* Hit configured limit          */
        case -EFBIG:           /* Hit fs limit                  */
        case -EDQUOT:          /* Quota limit hit               */
        case -ENOSPC:          /* Disk full                     */
                log_debug("%s: Allocation limit reached, rotating.", f->path);
                return true;
+
        case -EIO:             /* I/O error of some kind (mmap) */
                log_warning("%s: IO error, rotating.", f->path);
                return true;
+
        case -EHOSTDOWN:       /* Other machine                 */
                log_info("%s: Journal file from other machine, rotating.", f->path);
                return true;
+
        case -EBUSY:           /* Unclean shutdown              */
                log_info("%s: Unclean shutdown, rotating.", f->path);
                return true;
+
        case -EPROTONOSUPPORT: /* Unsupported feature           */
                log_info("%s: Unsupported feature, rotating.", f->path);
                return true;
+
        case -EBADMSG:         /* Corrupted                     */
        case -ENODATA:         /* Truncated                     */
        case -ESHUTDOWN:       /* Already archived              */
                log_warning("%s: Journal file corrupted, rotating.", f->path);
                return true;
+
        case -EIDRM:           /* Journal file has been deleted */
                log_warning("%s: Journal file has been deleted, rotating.", f->path);
                return true;
+
+        case -ETXTBSY:         /* Journal file is from the future */
+                log_warning("%s: Journal file is from the future, rotating.", f->path);
+                return true;
+
        default:
                return false;
        }
 }

 static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned n, int priority) {
+        bool vacuumed = false, rotate = false;
+        struct dual_timestamp ts;
        JournalFile *f;
-        bool vacuumed = false;
        int r;

        assert(s);
        assert(iovec);
        assert(n > 0);

-        f = find_journal(s, uid);
-        if (!f)
-                return;
+        /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use
+         * the source time, and not even the time the event was originally seen, but instead simply the time we started
+         * processing it, as we want strictly linear ordering in what we write out.) */
+        assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0);
+        assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0);

-        if (journal_file_rotate_suggested(f, s->max_file_usec)) {
-                log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+        if (ts.realtime < s->last_realtime_clock) {
+                /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during
+                 * regular operation. However, when it does happen, then we should make sure that we start fresh files
+                 * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure
+                 * bisection works correctly. */
+
+                log_debug("Time jumped backwards, rotating.");
+                rotate = true;
+        } else {
+
+                f = find_journal(s, uid);
+                if (!f)
+                        return;
+
+                if (journal_file_rotate_suggested(f, s->max_file_usec)) {
+                        log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
+                        rotate = true;
+                }
+        }
+
+        if (rotate) {
                server_rotate(s);
                server_vacuum(s, false, false);
                vacuumed = true;
@ -650,7 +684,9 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                        return;
        }

-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        s->last_realtime_clock = ts.realtime;
+
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
        if (r >= 0) {
                server_schedule_sync(s, priority);
                return;
@ -669,7 +705,7 @@ static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, unsigned
                return;

        log_debug("Retrying write.");
-        r = journal_file_append_entry(f, NULL, iovec, n, &s->seqnum, NULL, NULL);
+        r = journal_file_append_entry(f, &ts, iovec, n, &s->seqnum, NULL, NULL);
        if (r < 0)
                log_error_errno(r, "Failed to write entry (%d items, %zu bytes) despite vacuuming, ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
        else
--- a/src/journal/journald-server.h
+++ b/src/journal/journald-server.h
@ -149,6 +149,8 @@ struct Server {
        char *cgroup_root;

        usec_t watchdog_usec;
+
+        usec_t last_realtime_clock;
 };

 #define SERVER_MACHINE_ID(s) ((s)->machine_id_field + strlen("_MACHINE_ID="))