Merge pull request #20344 from poettering/revert-close-all

Alternative to #20288 – close_all_fds() reworking
2025-03-07 04:58:29 +03:00 · 2021-10-27 22:02:38 +02:00 · 2021-10-27 22:02:38 +02:00 · d5bb2b0375
commit d5bb2b0375
parent 3dea470121 b689197241
8 changed files with 310 additions and 116 deletions
--- a/src/basic/fd-util.c
+++ b/src/basic/fd-util.c
@ -187,7 +187,7 @@ _pure_ static bool fd_in_set(int fd, const int fdset[], size_t n_fdset) {
        return false;
 }

-static int get_max_fd(void) {
+int get_max_fd(void) {
        struct rlimit rl;
        rlim_t m;

@ -208,104 +208,17 @@ static int get_max_fd(void) {
        return (int) (m - 1);
 }

-int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
-        static bool have_close_range = true; /* Assume we live in the future */
-        _cleanup_closedir_ DIR *d = NULL;
-        int r = 0;
+static int close_all_fds_frugal(const int except[], size_t n_except) {
+        int max_fd, r = 0;

        assert(n_except == 0 || except);

-        if (have_close_range) {
-                /* In the best case we have close_range() to close all fds between a start and an end fd,
-                 * which we can use on the "inverted" exception array, i.e. all intervals between all
-                 * adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
-                 * where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
-                 * open. Given that we assume n ≫ m that's preferable to us. */
+        /* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
+         * and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
+         * but when we assume we are called from signal handler context, then use this simpler call
+         * instead. */

-                if (n_except == 0) {
-                        /* Close everything. Yay! */
-
-                        if (close_range(3, -1, 0) >= 0)
-                                return 0;
-
-                        if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
-                                have_close_range = false;
-                        else
-                                return -errno;
-
-                } else {
-                        typesafe_qsort(except, n_except, cmp_int);
-
-                        for (size_t i = 0; i < n_except; i++) {
-                                int start = i == 0 ? 2 : MAX(except[i-1], 2); /* The first three fds shall always remain open */
-                                int end = MAX(except[i], 2);
-
-                                assert(end >= start);
-
-                                if (end - start <= 1)
-                                        continue;
-
-                                /* Close everything between the start and end fds (both of which shall stay open) */
-                                if (close_range(start + 1, end - 1, 0) < 0) {
-                                        if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
-                                                have_close_range = false;
-                                        else
-                                                return -errno;
-                                        goto opendir_fallback;
-                                }
-                        }
-
-                        /* The loop succeeded. Let's now close everything beyond the end */
-
-                        if (except[n_except-1] >= INT_MAX) /* Don't let the addition below overflow */
-                                return 0;
-
-                        int start = MAX(except[n_except-1], 2);
-
-                        if (close_range(start + 1, -1, 0) >= 0)
-                                return 0;
-
-                        if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
-                                have_close_range = false;
-                        else
-                                return -errno;
-                }
-        }
-
-        /* Fallback for when close_range() is not supported */
- opendir_fallback:
-        d = allow_alloc ? opendir("/proc/self/fd") : NULL;
-        if (d) {
-                struct dirent *de;
-
-                FOREACH_DIRENT(de, d, return -errno) {
-                        int fd = -1, q;
-
-                        if (safe_atoi(de->d_name, &fd) < 0)
-                                /* Let's better ignore this, just in case */
-                                continue;
-
-                        if (fd < 3)
-                                continue;
-
-                        if (fd == dirfd(d))
-                                continue;
-
-                        if (fd_in_set(fd, except, n_except))
-                                continue;
-
-                        q = close_nointr(fd);
-                        if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
-                                r = q;
-                }
-
-                return r;
-        }
-
-        /* Fallback for when /proc isn't available (for example in chroots) or when we cannot allocate by
-         * brute-forcing through the file descriptor table. */
-
-        int max_fd = get_max_fd();
+        max_fd = get_max_fd();
        if (max_fd < 0)
                return max_fd;

@ -313,7 +226,7 @@ int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
         * spin the CPU for a long time. */
        if (max_fd > MAX_FD_LOOP_LIMIT)
                return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
-                                       "/proc/self/fd is inaccessible. Refusing to loop over %d potential fds.",
+                                       "Refusing to loop over %d potential fds.",
                                       max_fd);

        for (int fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
@ -330,6 +243,179 @@ int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
        return r;
 }

+static bool have_close_range = true; /* Assume we live in the future; cleared once close_range() proves unusable */
+
+static int close_all_fds_special_case(const int except[], size_t n_except) {
+        assert(n_except == 0 || except);
+
+        /* Handles a few common special cases separately, since they can be optimized really nicely: we
+         * won't need sorting for them. Returns > 0 if the special casing worked, 0 if it did not apply or
+         * close_range() is unavailable, and a negative errno-style error on any other failure. */
+
+        if (!have_close_range)
+                return 0;
+
+        switch (n_except) {
+
+        case 0:
+                /* Nothing to keep: close everything from fd 3 upwards in one go. Yay! */
+
+                if (close_range(3, -1, 0) >= 0)
+                        return 1;
+
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
+                        have_close_range = false;
+                        return 0;
+                }
+
+                return -errno;
+
+        case 1:
+                /* Close all but exactly one fd: no sorting needed, we just close the range below it
+                 * and the range above it (skipping whichever would be empty or overflow). */
+
+                if ((except[0] <= 3 || close_range(3, except[0]-1, 0) >= 0) &&
+                    (except[0] >= INT_MAX || close_range(MAX(3, except[0]+1), -1, 0) >= 0))
+                        return 1;
+
+                if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno)) {
+                        have_close_range = false;
+                        return 0;
+                }
+
+                return -errno;
+
+        default:
+                return 0;
+        }
+}
+
+int close_all_fds_without_malloc(const int except[], size_t n_except) {
+        int r;
+
+        assert(n_except == 0 || except);
+        /* No malloc()/opendir() here: try the close_range() shortcuts, else fall back to the frugal loop. */
+        r = close_all_fds_special_case(except, n_except);
+        if (r < 0)
+                return r;
+        if (r > 0) /* special case worked! */
+                return 0;
+
+        return close_all_fds_frugal(except, n_except);
+}
+
+int close_all_fds(const int except[], size_t n_except) {
+        _cleanup_closedir_ DIR *d = NULL;
+        struct dirent *de;
+        int r = 0;
+        /* Closes all fds >= 3 that are not in except[]. Returns 0 on success, a negative error otherwise. */
+        assert(n_except == 0 || except);
+
+        r = close_all_fds_special_case(except, n_except);
+        if (r < 0)
+                return r;
+        if (r > 0) /* special case worked! */
+                return 0;
+
+        if (have_close_range) {
+                _cleanup_free_ int *sorted_malloc = NULL;
+                size_t n_sorted;
+                int *sorted;
+
+                /* In the best case we have close_range() to close all fds between a start and an end fd,
+                 * which we can use on the "inverted" exception array, i.e. all intervals between all
+                 * adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
+                 * where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
+                 * open. Given that we assume n ≫ m that's preferable to us. */
+
+                assert(n_except < SIZE_MAX);
+                n_sorted = n_except + 1;
+
+                if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
+                        sorted = sorted_malloc = new(int, n_sorted);
+                else
+                        sorted = newa(int, n_sorted);
+
+                if (sorted) {
+                        memcpy(sorted, except, n_except * sizeof(int));
+
+                        /* Let's add fd 2 to the list of fds, to simplify the loop below, as this
+                         * allows us to cover the head of the array the same way as the body */
+                        sorted[n_sorted-1] = 2;
+
+                        typesafe_qsort(sorted, n_sorted, cmp_int);
+
+                        for (size_t i = 0; i < n_sorted-1; i++) {
+                                int start, end;
+
+                                start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
+                                end = MAX(sorted[i+1], 2);
+
+                                assert(end >= start);
+
+                                if (end - start <= 1)
+                                        continue;
+
+                                /* Close everything between the start and end fds (both of which shall stay open) */
+                                if (close_range(start + 1, end - 1, 0) < 0) {
+                                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                                return -errno;
+
+                                        have_close_range = false;
+                                        break;
+                                }
+                        }
+
+                        if (have_close_range) {
+                                /* The loop succeeded. Let's now close everything beyond the end */
+
+                                if (sorted[n_sorted-1] >= INT_MAX) /* Don't let the addition below overflow */
+                                        return 0;
+
+                                if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
+                                        return 0;
+
+                                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                        return -errno;
+
+                                have_close_range = false;
+                        }
+                }
+
+                /* Fallback on OOM or if close_range() is not supported */
+        }
+
+        d = opendir("/proc/self/fd");
+        if (!d)
+                return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */
+
+        FOREACH_DIRENT(de, d, return -errno) {
+                int fd = -1, q;
+
+                if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
+                        continue;
+
+                if (safe_atoi(de->d_name, &fd) < 0)
+                        /* Let's better ignore this, just in case */
+                        continue;
+
+                if (fd < 3)
+                        continue;
+
+                if (fd == dirfd(d))
+                        continue;
+
+                if (fd_in_set(fd, except, n_except))
+                        continue;
+
+                q = close_nointr(fd);
+                if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
+                        r = q;
+        }
+
+        return r;
+}
+
 int same_fd(int a, int b) {
        struct stat sta, stb;
        pid_t pid;
--- a/src/basic/fd-util.h
+++ b/src/basic/fd-util.h
@ -57,10 +57,10 @@ DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(DIR*, closedir, NULL);
 int fd_nonblock(int fd, bool nonblock);
 int fd_cloexec(int fd, bool cloexec);

-int close_all_fds_full(int except[], size_t n_except, bool allow_alloc);
-static inline int close_all_fds(int except[], size_t n_except) {
-        return close_all_fds_full(except, n_except, true);
-}
+int get_max_fd(void);
+
+int close_all_fds(const int except[], size_t n_except);
+int close_all_fds_without_malloc(const int except[], size_t n_except);

 int same_fd(int a, int b);

--- a/src/basic/process-util.c
+++ b/src/basic/process-util.c
@ -1246,7 +1246,7 @@ static void restore_sigsetp(sigset_t **ssp) {

 int safe_fork_full(
                const char *name,
-                int except_fds[],
+                const int except_fds[],
                size_t n_except_fds,
                ForkFlags flags,
                pid_t *ret_pid) {
@ -1441,7 +1441,7 @@ int safe_fork_full(
 int namespace_fork(
                const char *outer_name,
                const char *inner_name,
-                int except_fds[],
+                const int except_fds[],
                size_t n_except_fds,
                ForkFlags flags,
                int pidns_fd,
@ -1457,8 +1457,7 @@ int namespace_fork(
         * process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
         * /proc/self/fd works correctly. */

-        r = safe_fork_full(outer_name, except_fds, n_except_fds,
-                           (flags|FORK_DEATHSIG) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
+        r = safe_fork_full(outer_name, except_fds, n_except_fds, (flags|FORK_DEATHSIG) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
        if (r < 0)
                return r;
        if (r == 0) {
@ -1621,8 +1620,10 @@ bool invoked_as(char *argv[], const char *token) {
 _noreturn_ void freeze(void) {
        log_close();

-        /* Make sure nobody waits for us on a socket anymore */
-        (void) close_all_fds_full(NULL, 0, false);
+        /* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
+         * close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
+         * to be compatible with being called from signal handlers. */
+        (void) close_all_fds_without_malloc(NULL, 0);

        /* Let's not freeze right away, but keep reaping zombies. */
        for (;;) {
--- a/src/basic/process-util.h
+++ b/src/basic/process-util.h
@ -166,13 +166,13 @@ typedef enum ForkFlags {
        FORK_NEW_USERNS         = 1 << 13, /* Run child in its own user namespace */
 } ForkFlags;

-int safe_fork_full(const char *name, int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);
+int safe_fork_full(const char *name, const int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);

 static inline int safe_fork(const char *name, ForkFlags flags, pid_t *ret_pid) {
        return safe_fork_full(name, NULL, 0, flags, ret_pid);
 }

-int namespace_fork(const char *outer_name, const char *inner_name, int except_fds[], size_t n_except_fds, ForkFlags flags, int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd, pid_t *ret_pid);
+int namespace_fork(const char *outer_name, const char *inner_name, const int except_fds[], size_t n_except_fds, ForkFlags flags, int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd, pid_t *ret_pid);

 int set_oom_score_adjust(int value);
 int get_oom_score_adjust(int *ret);
--- a/src/shared/exec-util.c
+++ b/src/shared/exec-util.c
@ -471,7 +471,7 @@ int fexecve_or_execve(int executable_fd, const char *executable, char *const arg
        return -errno;
 }

-int fork_agent(const char *name, int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
+int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
        bool stdout_is_tty, stderr_is_tty;
        size_t n, i;
        va_list ap;
--- a/src/shared/exec-util.h
+++ b/src/shared/exec-util.h
@ -49,4 +49,4 @@ ExecCommandFlags exec_command_flags_from_string(const char *s);

 int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]);

-int fork_agent(const char *name, int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) _sentinel_;
+int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) _sentinel_;
--- a/src/test/meson.build
+++ b/src/test/meson.build
@ -227,7 +227,9 @@ tests += [

        [['src/test/test-proc-cmdline.c']],

-        [['src/test/test-fd-util.c']],
+        [['src/test/test-fd-util.c'],
+         [],
+         [libseccomp]],

        [['src/test/test-web-util.c']],

--- a/src/test/test-fd-util.c
+++ b/src/test/test-fd-util.c
@ -9,10 +9,13 @@
 #include "fileio.h"
 #include "macro.h"
 #include "memory-util.h"
+#include "missing_syscall.h"
+#include "mount-util.h"
 #include "path-util.h"
 #include "process-util.h"
 #include "random-util.h"
 #include "rlimit-util.h"
+#include "seccomp-util.h"
 #include "serialize.h"
 #include "string-util.h"
 #include "tests.h"
@ -213,20 +216,29 @@ static size_t validate_fds(
        return c; /* Return number of fds >= 0 in the array */
 }

-static void test_close_all_fds(void) {
+static void test_close_all_fds_inner(void) {
        _cleanup_free_ int *fds = NULL, *keep = NULL;
-        struct rlimit rl;
        size_t n_fds, n_keep;
+        int max_fd;

        log_info("/* %s */", __func__);

        rlimit_nofile_bump(-1);

-        assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
-        assert_se(rl.rlim_cur > 10);
+        max_fd = get_max_fd();
+        assert_se(max_fd > 10);
+
+        if (max_fd > 7000) {
+                /* If the worst fallback is activated we need to iterate through all possible fds, hence,
+                 * let's lower the limit a small bit, so that we don't run for too long. Yes, this undoes the
+                 * rlimit_nofile_bump() call above partially. */
+
+                (void) setrlimit_closest(RLIMIT_NOFILE, &(struct rlimit) { 7000, 7000 });
+                max_fd = 7000;
+        }

        /* Try to use 5000 fds, but when we can't bump the rlimit to make that happen use the whole limit minus 10 */
-        n_fds = MIN((rl.rlim_cur & ~1U) - 10U, 5000U);
+        n_fds = MIN(((size_t) max_fd & ~1U) - 10U, 5000U);
        assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */

        /* Allocate the determined number of fds, always two at a time */
@ -278,6 +290,99 @@ static void test_close_all_fds(void) {
        log_open();
 }

+static int seccomp_prohibit_close_range(void) {
+        /* Installs a seccomp filter that makes close_range() fail with EPERM. Returns 0 on success, or the
+         * (negative) result of log_warning_errno() if the filter cannot be set up or support is compiled
+         * out. HAVE_SECCOMP is tested by value rather than with defined(): presumably the build always
+         * defines it, as 0 or 1, which would make a defined() check true without seccomp -- TODO confirm. */
+#if HAVE_SECCOMP && defined(__SNR_close_range)
+        _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+        int r;
+
+        r = seccomp_init_for_arch(&seccomp, SCMP_ARCH_NATIVE, SCMP_ACT_ALLOW);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to acquire seccomp context, ignoring: %m");
+
+        r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(close_range), 0);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to add close_range() rule, ignoring: %m");
+
+        r = seccomp_load(seccomp);
+        if (r < 0)
+                return log_warning_errno(r, "Failed to apply close_range() restrictions, ignoring: %m");
+
+        return 0;
+#else
+        return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Seccomp support or close_range() syscall definition not available.");
+#endif
+}
+
+static void test_close_all_fds(void) {
+        int r;
+
+        /* Runs the test four times. Once as is. Once with /proc overmounted, once with the close_range()
+         * syscall blocked via seccomp, and once with the combination of both. This should trigger all
+         * fallbacks in close_all_fds(). Each variant runs in a forked child, and is skipped (not failed)
+         * if its restriction cannot be set up; the restricted variants are only attempted as root. */
+
+        r = safe_fork("(caf-plain)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
+        if (r == 0) {
+                test_close_all_fds_inner();
+                _exit(EXIT_SUCCESS);
+        }
+        assert_se(r >= 0);
+
+        if (geteuid() != 0) {
+                log_notice("Lacking privileges, skipping running tests with blocked close_range() and with /proc/ overmounted.");
+                return;
+        }
+
+        r = safe_fork("(caf-noproc)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL);
+        if (r == 0) {
+                r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL);
+                if (r < 0)
+                        log_notice("Overmounting /proc didn't work, skipping close_all_fds() with masked /proc/.");
+                else
+                        test_close_all_fds_inner();
+                _exit(EXIT_SUCCESS);
+        }
+        assert_se(r >= 0);
+
+        if (!is_seccomp_available()) {
+                log_notice("Seccomp not available, skipping seccomp tests in %s", __func__);
+                return;
+        }
+
+        r = safe_fork("(caf-seccomp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
+        if (r == 0) {
+                r = seccomp_prohibit_close_range();
+                if (r < 0)
+                        log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range().");
+                else
+                        test_close_all_fds_inner();
+
+                _exit(EXIT_SUCCESS);
+        }
+        assert_se(r >= 0);
+
+        r = safe_fork("(caf-scnp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL);
+        if (r == 0) {
+                r = seccomp_prohibit_close_range();
+                if (r < 0)
+                        log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range().");
+                else {
+                        r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL);
+                        if (r < 0)
+                                log_notice("Overmounting /proc didn't work, skipping close_all_fds() with masked /proc/.");
+                        else
+                                test_close_all_fds_inner();
+                }
+
+                _exit(EXIT_SUCCESS);
+        }
+        assert_se(r >= 0);
+}
+
 static void test_format_proc_fd_path(void) {
        assert_se(streq_ptr(FORMAT_PROC_FD_PATH(0), "/proc/self/fd/0"));
        assert_se(streq_ptr(FORMAT_PROC_FD_PATH(1), "/proc/self/fd/1"));