Merge pull request #26959 from poettering/creds-mount-dep-fix

credential ramfs mount order fixes
2025-03-09 12:58:26 +03:00 · 2023-05-25 02:06:47 -07:00 · 2023-05-25 02:06:47 -07:00 · 0f50da0f6b
commit 0f50da0f6b
parent 21392bcb1c 00078fb309
12 changed files with 245 additions and 49 deletions
--- a/man/rules/meson.build
+++ b/man/rules/meson.build
@ -633,6 +633,7 @@ manpages = [
  '3',
  ['sd_event_source_get_ratelimit',
   'sd_event_source_is_ratelimited',
+   'sd_event_source_leave_ratelimit',
   'sd_event_source_set_ratelimit_expire_callback'],
  ''],
 ['sd_event_source_set_userdata', '3', ['sd_event_source_get_userdata'], ''],
--- a/man/sd_event_source_set_ratelimit.xml
+++ b/man/sd_event_source_set_ratelimit.xml
@ -20,6 +20,7 @@
    <refname>sd_event_source_get_ratelimit</refname>
    <refname>sd_event_source_is_ratelimited</refname>
    <refname>sd_event_source_set_ratelimit_expire_callback</refname>
+    <refname>sd_event_source_leave_ratelimit</refname>

    <refpurpose>Configure rate limiting on event sources</refpurpose>
  </refnamediv>
@ -53,6 +54,10 @@
        <paramdef>sd_event_handler_t<parameter>callback</parameter></paramdef>
      </funcprototype>

+      <funcprototype>
+        <funcdef>int <function>sd_event_source_leave_ratelimit</function></funcdef>
+        <paramdef>sd_event_source *<parameter>source</parameter></paramdef>
+      </funcprototype>
    </funcsynopsis>
  </refsynopsisdiv>

@ -85,10 +90,14 @@
    is currently affected by rate limiting, i.e. it has recently hit the rate limit and is currently
    temporarily disabled due to that.</para>

-    <para><function>sd_event_source_set_ratelimit_expire_callback</function> may be used to set a callback
+    <para><function>sd_event_source_set_ratelimit_expire_callback()</function> may be used to set a callback
    function that is invoked every time the event source leaves rate limited state. Note that function is
    called in the same event loop iteration in which state transition occurred.</para>

+    <para><function>sd_event_source_leave_ratelimit()</function> may be used to immediately reenable an event
+    source that was temporarily disabled due to rate limiting. This will reset the ratelimit counters for the
+    current time interval.</para>
+
    <para>Rate limiting is currently implemented for I/O, timer, signal, defer and inotify event
    sources.</para>
  </refsect1>
@ -98,10 +107,12 @@

    <para>On success, <function>sd_event_source_set_ratelimit()</function>,
    <function>sd_event_source_set_ratelimit_expire_callback</function> and
-    <function>sd_event_source_get_ratelimit()</function> return a non-negative integer. On failure, they return
-    a negative errno-style error code. <function>sd_event_source_is_ratelimited</function> returns zero if rate
-    limiting is currently not in effect and greater than zero if it is in effect; it returns a negative
-    errno-style error code on failure.</para>
+    <function>sd_event_source_get_ratelimit()</function> return a non-negative integer. On failure, they
+    return a negative errno-style error code. <function>sd_event_source_is_ratelimited()</function> returns
+    zero if rate limiting is currently not in effect and greater than zero if it is in effect; it returns a
+    negative errno-style error code on failure. <function>sd_event_source_leave_ratelimit()</function>
+    returns zero if rate limiting wasn't in effect on the specified event source, and positive if it was and
+    rate limiting is now turned off again; it returns a negative errno-style error code on failure.</para>

    <refsect2>
      <title>Errors</title>
--- a/src/basic/unit-def.c
+++ b/src/basic/unit-def.c
@ -152,6 +152,7 @@ static const char* const mount_state_table[_MOUNT_STATE_MAX] = {
        [MOUNT_REMOUNTING_SIGKILL] = "remounting-sigkill",
        [MOUNT_UNMOUNTING_SIGTERM] = "unmounting-sigterm",
        [MOUNT_UNMOUNTING_SIGKILL] = "unmounting-sigkill",
+        [MOUNT_UNMOUNTING_CATCHUP] = "unmounting-catchup",
        [MOUNT_FAILED]             = "failed",
        [MOUNT_CLEANING]           = "cleaning",
 };
--- a/src/basic/unit-def.h
+++ b/src/basic/unit-def.h
@ -97,6 +97,7 @@ typedef enum MountState {
        MOUNT_REMOUNTING_SIGKILL,
        MOUNT_UNMOUNTING_SIGTERM,
        MOUNT_UNMOUNTING_SIGKILL,
+        MOUNT_UNMOUNTING_CATCHUP,
        MOUNT_FAILED,
        MOUNT_CLEANING,
        _MOUNT_STATE_MAX,
--- a/src/core/execute.c
+++ b/src/core/execute.c
@ -1539,7 +1539,7 @@ static bool context_has_no_new_privileges(const ExecContext *c) {
                context_has_syscall_logs(c);
 }

-static bool exec_context_has_credentials(const ExecContext *context) {
+bool exec_context_has_credentials(const ExecContext *context) {

        assert(context);

@ -2787,7 +2787,7 @@ static char **credential_search_path(
        if (DEBUG_LOGGING) {
                _cleanup_free_ char *t = strv_join(l, ":");

-                log_debug("Credential search path is: %s", t);
+                log_debug("Credential search path is: %s", strempty(t));
        }

        return TAKE_PTR(l);
--- a/src/core/execute.h
+++ b/src/core/execute.h
@ -476,6 +476,7 @@ const char* exec_context_fdname(const ExecContext *c, int fd_index);
 bool exec_context_may_touch_console(const ExecContext *c);
 bool exec_context_maintains_privileges(const ExecContext *c);
 bool exec_context_has_encrypted_credentials(ExecContext *c);
+bool exec_context_has_credentials(const ExecContext *context);

 int exec_context_get_effective_ioprio(const ExecContext *c);
 bool exec_context_get_effective_mount_apivfs(const ExecContext *c);
--- a/src/core/mount.c
+++ b/src/core/mount.c
@ -48,6 +48,7 @@ static const UnitActiveState state_translation_table[_MOUNT_STATE_MAX] = {
        [MOUNT_REMOUNTING_SIGKILL] = UNIT_RELOADING,
        [MOUNT_UNMOUNTING_SIGTERM] = UNIT_DEACTIVATING,
        [MOUNT_UNMOUNTING_SIGKILL] = UNIT_DEACTIVATING,
+        [MOUNT_UNMOUNTING_CATCHUP] = UNIT_DEACTIVATING,
        [MOUNT_FAILED] = UNIT_FAILED,
        [MOUNT_CLEANING] = UNIT_MAINTENANCE,
 };
@ -473,6 +474,22 @@ static bool mount_is_extrinsic(Unit *u) {
        return false;
 }

+static bool mount_is_credentials(Mount *m) {
+        const char *e;
+
+        assert(m);
+
+        /* Returns true if this is a credentials mount. We don't want automatic dependencies on credential
+         * mounts, since they are managed by us for even the earliest services, and we never want anything to
+         * be ordered before them hence. */
+
+        e = path_startswith(m->where, UNIT(m)->manager->prefix[EXEC_DIRECTORY_RUNTIME]);
+        if (!e)
+                return false;
+
+        return !isempty(path_startswith(e, "credentials"));
+}
+
 static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
        const char *after, *before, *e;
        int r;
@ -495,7 +512,10 @@ static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p,
                after = SPECIAL_LOCAL_FS_PRE_TARGET;
                before = SPECIAL_INITRD_USR_FS_TARGET;

-        } else if (mount_is_network(p)) {
+        } else if (mount_is_credentials(m))
+                after = before = NULL;
+
+        else if (mount_is_network(p)) {
                after = SPECIAL_REMOTE_FS_PRE_TARGET;
                before = SPECIAL_REMOTE_FS_TARGET;

@ -504,17 +524,56 @@ static int mount_add_default_ordering_dependencies(Mount *m, MountParameters *p,
                before = SPECIAL_LOCAL_FS_TARGET;
        }

-        if (!mount_is_nofail(m)) {
+        if (before && !mount_is_nofail(m)) {
                r = unit_add_dependency_by_name(UNIT(m), UNIT_BEFORE, before, /* add_reference= */ true, mask);
                if (r < 0)
                        return r;
        }

+        if (after) {
                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, after, /* add_reference= */ true, mask);
                if (r < 0)
                        return r;
+        }

-        return unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET,
+        r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_BEFORE, UNIT_CONFLICTS, SPECIAL_UMOUNT_TARGET,
+                                              /* add_reference= */ true, mask);
+        if (r < 0)
+                return r;
+
+        /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */
+        if (streq_ptr(p->fstype, "tmpfs") && !mount_is_credentials(m)) {
+                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET,
+                                                /* add_reference= */ true, mask);
+                if (r < 0)
+                        return r;
+        }
+
+        return 0;
+}
+
+static int mount_add_default_network_dependencies(Mount *m, MountParameters *p, UnitDependencyMask mask) {
+        int r;
+
+        assert(m);
+
+        if (!mount_is_network(p))
+                return 0;
+
+        /* We order ourselves after network.target. This is primarily useful at shutdown: services that take
+         * down the network should order themselves before network.target, so that they are shut down only
+         * after this mount unit is stopped. */
+
+        r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET,
+                                        /* add_reference= */ true, mask);
+        if (r < 0)
+                return r;
+
+        /* We pull in network-online.target, and order ourselves after it. This is useful at start-up to
+         * actively pull in tools that want to be started before we start mounting network file systems, and
+         * whose purpose it is to delay this until the network is "up". */
+
+        return unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET,
                                                 /* add_reference= */ true, mask);
 }

@ -545,36 +604,10 @@ static int mount_add_default_dependencies(Mount *m) {
        if (r < 0)
                return r;

-        if (mount_is_network(p)) {
-                /* We order ourselves after network.target. This is primarily useful at shutdown:
-                 * services that take down the network should order themselves before
-                 * network.target, so that they are shut down only after this mount unit is
-                 * stopped. */
-
-                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_NETWORK_TARGET,
-                                                /* add_reference= */ true, mask);
+        r = mount_add_default_network_dependencies(m, p, mask);
        if (r < 0)
                return r;

-                /* We pull in network-online.target, and order ourselves after it. This is useful
-                 * at start-up to actively pull in tools that want to be started before we start
-                 * mounting network file systems, and whose purpose it is to delay this until the
-                 * network is "up". */
-
-                r = unit_add_two_dependencies_by_name(UNIT(m), UNIT_WANTS, UNIT_AFTER, SPECIAL_NETWORK_ONLINE_TARGET,
-                                                      /* add_reference= */ true, mask);
-                if (r < 0)
-                        return r;
-        }
-
-        /* If this is a tmpfs mount then we have to unmount it before we try to deactivate swaps */
-        if (streq_ptr(p->fstype, "tmpfs")) {
-                r = unit_add_dependency_by_name(UNIT(m), UNIT_AFTER, SPECIAL_SWAP_TARGET,
-                                                /* add_reference= */ true, mask);
-                if (r < 0)
-                        return r;
-        }
-
        return exec_context_add_default_dependencies(UNIT(m), &m->exec_context);
 }

@ -1037,11 +1070,37 @@ static void mount_enter_unmounting(Mount *m) {

        assert(m);

+        r = path_is_mount_point(m->where, NULL, 0);
+        if (IN_SET(r, 0, -ENOENT)) {
+                /* Not a mount point anymore? Then we raced against something else which unmounted it. Let's
+                 * handle this gracefully, and wait until we get the kernel notification about it. */
+
+                log_unit_debug(UNIT(m), "Path '%s' is not a mount point anymore, waiting for mount table refresh.", m->where);
+
+                /* Apparently our idea of the kernel mount table is out of date. Make sure we re-read it
+                 * again, soon, so that we don't delay mount handling unnecessarily. */
+                (void) sd_event_source_leave_ratelimit(UNIT(m)->manager->mount_event_source);
+
+                m->control_command_id = _MOUNT_EXEC_COMMAND_INVALID;
+                mount_unwatch_control_pid(m);
+
+                r = mount_arm_timer(m, usec_add(now(CLOCK_MONOTONIC), m->timeout_usec));
+                if (r < 0)
+                        goto fail;
+
+                /* Let's enter a distinct state where we just wait for the mount table to catch up */
+                mount_set_state(m, MOUNT_UNMOUNTING_CATCHUP);
+                return;
+        }
+        if (r < 0)
+                log_unit_debug_errno(UNIT(m), r, "Unable to determine if '%s' is a mount point, ignoring: %m", m->where);
+
        /* Start counting our attempts */
        if (!IN_SET(m->state,
                    MOUNT_UNMOUNTING,
                    MOUNT_UNMOUNTING_SIGTERM,
-                    MOUNT_UNMOUNTING_SIGKILL))
+                    MOUNT_UNMOUNTING_SIGKILL,
+                    MOUNT_UNMOUNTING_CATCHUP))
                m->n_retry_umount = 0;

        m->control_command_id = MOUNT_EXEC_UNMOUNT;
@ -1062,7 +1121,6 @@ static void mount_enter_unmounting(Mount *m) {
                goto fail;

        mount_set_state(m, MOUNT_UNMOUNTING);
-
        return;

 fail:
@ -1235,6 +1293,7 @@ static int mount_start(Unit *u) {
                   MOUNT_UNMOUNTING,
                   MOUNT_UNMOUNTING_SIGTERM,
                   MOUNT_UNMOUNTING_SIGKILL,
+                   MOUNT_UNMOUNTING_CATCHUP,
                   MOUNT_CLEANING))
                return -EAGAIN;

@ -1264,6 +1323,7 @@ static int mount_stop(Unit *u) {
        case MOUNT_UNMOUNTING:
        case MOUNT_UNMOUNTING_SIGKILL:
        case MOUNT_UNMOUNTING_SIGTERM:
+        case MOUNT_UNMOUNTING_CATCHUP:
                /* Already on it */
                return 0;

@ -1529,6 +1589,10 @@ static void mount_sigchld_event(Unit *u, pid_t pid, int code, int status) {
                mount_enter_dead_or_mounted(m, f);
                break;

+        case MOUNT_UNMOUNTING_CATCHUP:
+                log_unit_debug(u, "Was notified about control process death, but wasn't expecting it. Ignoring.");
+                break;
+
        case MOUNT_CLEANING:
                if (m->clean_result == MOUNT_SUCCESS)
                        m->clean_result = f;
@ -1603,6 +1667,11 @@ static int mount_dispatch_timer(sd_event_source *source, usec_t usec, void *user
                mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
                break;

+        case MOUNT_UNMOUNTING_CATCHUP:
+                log_unit_warning(UNIT(m), "Waiting for unmount notification timed out. Giving up.");
+                mount_enter_dead_or_mounted(m, MOUNT_FAILURE_TIMEOUT);
+                break;
+
        case MOUNT_CLEANING:
                log_unit_warning(UNIT(m), "Cleaning timed out. killing.");

@ -2044,8 +2113,7 @@ static int mount_process_proc_self_mountinfo(Manager *m) {

                if (!mount_is_mounted(mount)) {

-                        /* A mount point is not around right now. It
-                         * might be gone, or might never have
+                        /* A mount point is not around right now. It might be gone, or might never have
                         * existed. */

                        if (mount->from_proc_self_mountinfo &&
@ -2060,6 +2128,7 @@ static int mount_process_proc_self_mountinfo(Manager *m) {
                        switch (mount->state) {

                        case MOUNT_MOUNTED:
+                        case MOUNT_UNMOUNTING_CATCHUP:
                                /* This has just been unmounted by somebody else, follow the state change. */
                                mount_enter_dead(mount, MOUNT_SUCCESS);
                                break;
@ -2098,11 +2167,9 @@ static int mount_process_proc_self_mountinfo(Manager *m) {
                                break;

                        default:
-                                /* Nothing really changed, but let's
-                                 * issue an notification call
-                                 * nonetheless, in case somebody is
-                                 * waiting for this. (e.g. file system
-                                 * ro/rw remounts.) */
+                                /* Nothing really changed, but let's issue a notification call nonetheless,
+                                 * in case somebody is waiting for this. (e.g. file system ro/rw
+                                 * remounts.) */
                                mount_set_state(mount, mount->state);
                                break;
                        }
--- a/src/core/unit.c
+++ b/src/core/unit.c
@ -1411,6 +1411,26 @@ int unit_add_exec_dependencies(Unit *u, ExecContext *c) {
        if (r < 0)
                return r;

+        if (exec_context_has_credentials(c) && u->manager->prefix[EXEC_DIRECTORY_RUNTIME]) {
+                _cleanup_free_ char *p = NULL, *m = NULL;
+
+                /* Let's make sure the credentials directory of this service is unmounted *after* the service
+                 * itself shuts down. This only matters if mount namespacing is not used for the service, and
+                 * hence the credentials mount appears on the host. */
+
+                p = path_join(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
+                if (!p)
+                        return -ENOMEM;
+
+                r = unit_name_from_path(p, ".mount", &m);
+                if (r < 0)
+                        return r;
+
+                r = unit_add_dependency_by_name(u, UNIT_AFTER, m, /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
+                if (r < 0)
+                        return r;
+        }
+
        return 0;
 }

--- a/src/libsystemd/libsystemd.sym
+++ b/src/libsystemd/libsystemd.sym
@ -824,4 +824,5 @@ global:
        sd_event_source_set_memory_pressure_period;
        sd_event_trim_memory;
        sd_pid_notify_barrier;
+        sd_event_source_leave_ratelimit;
 } LIBSYSTEMD_253;
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@ -5193,6 +5193,27 @@ _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        return s->ratelimited;
 }

+_public_ int sd_event_source_leave_ratelimit(sd_event_source *s) {
+        int r;
+
+        assert_return(s, -EINVAL);
+
+        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
+                return 0;
+
+        if (!ratelimit_configured(&s->rate_limit))
+                return 0;
+
+        if (!s->ratelimited)
+                return 0;
+
+        r = event_source_leave_ratelimit(s, /* run_callback */ false);
+        if (r < 0)
+                return r;
+
+        return 1; /* tell caller that we indeed just left the ratelimit state */
+}
+
 _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
        bool change = false;
        int r;
--- a/src/libsystemd/sd-event/test-event.c
+++ b/src/libsystemd/sd-event/test-event.c
@ -828,4 +828,75 @@ TEST(fork) {
        assert_se(r >= 0);
 }

+static int hup_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
+        unsigned *c = userdata;
+
+        assert_se(revents == EPOLLHUP);
+
+        (*c)++;
+        return 0;
+}
+
+TEST(leave_ratelimit) {
+        bool expect_ratelimit = false, manually_left_ratelimit = false;
+        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
+        _cleanup_(close_pairp) int pfd[2] = PIPE_EBADF;
+        unsigned c = 0;
+        int r;
+
+        assert_se(sd_event_default(&e) >= 0);
+
+        /* Create an event source that will continuously fire by creating a pipe whose write side is closed,
+         * and which hence will only see EOF and constant EPOLLHUP */
+        assert_se(pipe2(pfd, O_CLOEXEC) >= 0);
+        assert_se(sd_event_add_io(e, &s, pfd[0], EPOLLIN, hup_callback, &c) >= 0);
+        assert_se(sd_event_source_set_io_fd_own(s, true) >= 0);
+        assert_se(sd_event_source_set_ratelimit(s, 5*USEC_PER_MINUTE, 5) >= 0);
+
+        pfd[0] = -EBADF;
+        pfd[1] = safe_close(pfd[1]); /* Trigger continuous EOF */
+
+        for (;;) {
+                r = sd_event_prepare(e);
+                assert_se(r >= 0);
+
+                if (r == 0) {
+                        r = sd_event_wait(e, UINT64_MAX);
+                        assert_se(r > 0);
+                }
+
+                r = sd_event_dispatch(e);
+                assert_se(r > 0);
+
+                r = sd_event_source_is_ratelimited(s);
+                assert_se(r >= 0);
+
+                if (c < 5)
+                        /* First four dispatches should just work */
+                        assert_se(!r);
+                else if (c == 5) {
+                        /* The fifth dispatch should still work, but we now expect the ratelimit to be hit subsequently */
+                        if (!expect_ratelimit) {
+                                assert_se(!r);
+                                assert_se(sd_event_source_leave_ratelimit(s) == 0); /* this should be a NOP, and return 0 hence */
+                                expect_ratelimit = true;
+                        } else {
+                                /* We expected the ratelimit, let's leave it manually, and verify it */
+                                assert_se(r);
+                                assert_se(sd_event_source_leave_ratelimit(s) > 0); /* we are ratelimited, hence should return > 0 */
+                                assert_se(sd_event_source_is_ratelimited(s) == 0);
+
+                                manually_left_ratelimit = true;
+                        }
+
+                } else if (c == 6)
+                        /* On the sixth iteration let's just exit */
+                        break;
+        }
+
+        /* Verify we definitely hit the ratelimit and left it manually again */
+        assert_se(manually_left_ratelimit);
+}
+
 DEFINE_TEST_MAIN(LOG_DEBUG);
--- a/src/systemd/sd-event.h
+++ b/src/systemd/sd-event.h
@ -173,6 +173,7 @@ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval_usec, un
 int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_usec, unsigned *ret_burst);
 int sd_event_source_is_ratelimited(sd_event_source *s);
 int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback);
+int sd_event_source_leave_ratelimit(sd_event_source *s);

 int sd_event_trim_memory(void);