Merge pull request #17338 from poettering/close-range

make use of new kernel 5.9 close_range() syscall in close_all_fds()
2025-03-08 08:58:27 +03:00 · 2020-10-14 17:22:15 +02:00 · 2020-10-14 17:22:15 +02:00 · 7848cb8c57
commit 7848cb8c57
parent fd8f865c9f eaa2751685
7 changed files with 240 additions and 8 deletions
--- a/2
+++ b/2
@ -24,8 +24,6 @@ Features:

 * in fd_get_path() if we see (deleted) then do stat and check for st_nlink

-* add support for close_range() added in kernel 5.9
-
 * Add service setting to run a service within the specified VRF. i.e. do the
  equivalent of "ip vrf exec".

--- a/meson.build
+++ b/meson.build
@ -533,6 +533,7 @@ foreach ident : [
                                 #include <signal.h>
                                 #include <sys/wait.h>'''],
        ['mallinfo',          '''#include <malloc.h>'''],
+        ['close_range',       '''#include <unistd.h>'''],
 ]

        have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
--- a/src/basic/alloc-util.h
+++ b/src/basic/alloc-util.h
@ -27,7 +27,7 @@ typedef void (*free_func_t)(void *p);
                size_t _n_ = n;                                         \
                assert(!size_multiply_overflow(sizeof(t), _n_));        \
                assert(sizeof(t)*_n_ <= ALLOCA_MAX);                    \
-                (t*) alloca(sizeof(t)*_n_);                             \
+                (t*) alloca((sizeof(t)*_n_) ?: 1);                      \
        })

 #define newa0(t, n)                                                     \
@ -35,14 +35,14 @@ typedef void (*free_func_t)(void *p);
                size_t _n_ = n;                                         \
                assert(!size_multiply_overflow(sizeof(t), _n_));        \
                assert(sizeof(t)*_n_ <= ALLOCA_MAX);                    \
-                (t*) alloca0(sizeof(t)*_n_);                            \
+                (t*) alloca0((sizeof(t)*_n_) ?: 1);                     \
        })

 #define newdup(t, p, n) ((t*) memdup_multiply(p, sizeof(t), (n)))

 #define newdup_suffix0(t, p, n) ((t*) memdup_suffix0_multiply(p, sizeof(t), (n)))

-#define malloc0(n) (calloc(1, (n)))
+#define malloc0(n) (calloc(1, (n) ?: 1))

 static inline void *mfree(void *memory) {
        free(memory);
@ -65,7 +65,7 @@ void* memdup_suffix0(const void *p, size_t l); /* We can't use _alloc_() here, s
                void *_q_;                      \
                size_t _l_ = l;                 \
                assert(_l_ <= ALLOCA_MAX);      \
-                _q_ = alloca(_l_);              \
+                _q_ = alloca(_l_ ?: 1);         \
                memcpy(_q_, p, _l_);            \
        })

@ -135,7 +135,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
                char *_new_;                            \
                size_t _len_ = n;                       \
                assert(_len_ <= ALLOCA_MAX);            \
-                _new_ = alloca(_len_);                  \
+                _new_ = alloca(_len_ ?: 1);             \
                (void *) memset(_new_, 0, _len_);       \
        })

@ -146,7 +146,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
                size_t _mask_ = (align) - 1;                            \
                size_t _size_ = size;                                   \
                assert(_size_ <= ALLOCA_MAX);                           \
-                _ptr_ = alloca(_size_ + _mask_);                        \
+                _ptr_ = alloca((_size_ + _mask_) ?: 1);                 \
                (void*)(((uintptr_t)_ptr_ + _mask_) & ~_mask_);         \
        })

--- a/src/basic/fd-util.c
+++ b/src/basic/fd-util.c
@ -21,6 +21,7 @@
 #include "path-util.h"
 #include "process-util.h"
 #include "socket-util.h"
+#include "sort-util.h"
 #include "stat-util.h"
 #include "stdio-util.h"
 #include "tmpfile-util.h"
@ -210,13 +211,102 @@ static int get_max_fd(void) {
        return (int) (m - 1);
 }

+static int cmp_int(const int *a, const int *b) {
+        /* Three-way comparison of the two pointed-to ints; passed to typesafe_qsort() as comparator. */
+        const int lhs = *a, rhs = *b;
+
+        return CMP(lhs, rhs);
+}
+
 int close_all_fds(const int except[], size_t n_except) {
+        static bool have_close_range = true; /* Assume we live in the future */
        _cleanup_closedir_ DIR *d = NULL;
        struct dirent *de;
        int r = 0;

        assert(n_except == 0 || except);

+        if (have_close_range) {
+                /* In the best case we have close_range() to close all fds between a start and an end fd,
+                 * which we can use on the "inverted" exception array, i.e. all intervals between all
+                 * adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
+                 * where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
+                 * open. Given that we assume n ≫ m that's preferable to us. */
+
+                if (n_except == 0) {
+                        /* Close everything. Yay! */
+
+                        if (close_range(3, -1, 0) >= 0)
+                                return 1;
+
+                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                return -errno;
+
+                        have_close_range = false;
+                } else {
+                        _cleanup_free_ int *sorted_malloc = NULL;
+                        size_t n_sorted;
+                        int *sorted;
+
+                        assert(n_except < SIZE_MAX);
+                        n_sorted = n_except + 1;
+
+                        if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
+                                sorted = sorted_malloc = new(int, n_sorted);
+                        else
+                                sorted = newa(int, n_sorted);
+
+                        if (sorted) {
+                                int c = 0;
+
+                                memcpy(sorted, except, n_except * sizeof(int));
+
+                                /* Let's add fd 2 to the list of fds, to simplify the loop below, as this
+                                 * allows us to cover the head of the array the same way as the body */
+                                sorted[n_sorted-1] = 2;
+
+                                typesafe_qsort(sorted, n_sorted, cmp_int);
+
+                                for (size_t i = 0; i < n_sorted-1; i++) {
+                                        int start, end;
+
+                                        start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
+                                        end = MAX(sorted[i+1], 2);
+
+                                        assert(end >= start);
+
+                                        if (end - start <= 1)
+                                                continue;
+
+                                        /* Close everything between the start and end fds (both of which shall stay open) */
+                                        if (close_range(start + 1, end - 1, 0) < 0) {
+                                                if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                                        return -errno;
+
+                                                have_close_range = false;
+                                                break;
+                                        }
+
+                                        c += end - start - 1;
+                                }
+
+                                if (have_close_range) {
+                                        /* The loop succeeded. Let's now close everything beyond the end */
+
+                                        if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
+                                                return c;
+
+                                        if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
+                                                return c + 1;
+
+                                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
+                                                return -errno;
+
+                                        have_close_range = false;
+                                }
+                        }
+                }
+
+                /* Fallback on OOM or if close_range() is not supported */
+        }
+
        d = opendir("/proc/self/fd");
        if (!d) {
                int fd, max_fd;
--- a/src/basic/missing_syscall.h
+++ b/src/basic/missing_syscall.h
@ -734,3 +734,49 @@ static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info)

 #  define rt_sigqueueinfo missing_rt_sigqueueinfo
 #endif
+
+/* ======================================================================= */
+
+#define systemd_NR_close_range systemd_SC_arch_bias(436)
+
+/* __NR_close_range may be defined as an (invalid) negative number due to libseccomp, see PR 13319.
+ * A non-negative definition is kept and cross-checked against systemd_NR_close_range at compile time;
+ * otherwise any existing definition is dropped and replaced by systemd_NR_close_range. */
+#if defined __NR_close_range && __NR_close_range >= 0
+#  if defined systemd_NR_close_range
+assert_cc(__NR_close_range == systemd_NR_close_range);
+#  endif
+#else
+#  if defined __NR_close_range
+#    undef __NR_close_range
+#  endif
+#  if defined systemd_NR_close_range
+#    define __NR_close_range systemd_NR_close_range
+#  endif
+#endif
+
+#if !HAVE_CLOSE_RANGE
+/* Fallback for close_range(), compiled only when HAVE_CLOSE_RANGE is unset.
+ *
+ * first_fd: first fd of the range to close; must be >= 0.
+ * end_fd:   last fd of the range (inclusive); must be >= 0, or -1 meaning "up to UINT_MAX".
+ * flags:    passed through to the syscall unmodified.
+ *
+ * Returns the syscall's return value. Returns -1 with errno set to EBADF if an fd argument is negative
+ * (other than end_fd == -1), and -1 with errno set to ENOSYS if no syscall number is known. */
+static inline int missing_close_range(int first_fd, int end_fd, unsigned flags) {
+#  ifdef __NR_close_range
+        /* Kernel-side the syscall expects fds as unsigned integers (just like close() actually), while
+         * userspace exclusively uses signed integers for fds. We don't know just yet how glibc is going to
+         * wrap this syscall, but let's assume it's going to be similar to what they do for close(),
+         * i.e. make the same unsigned → signed type change from the raw kernel syscall compared to the
+         * userspace wrapper. There's only one caveat for this: unlike for close() there's the special
+         * UINT_MAX fd value for the 'end_fd' argument. Let's safely map that to -1 here. And let's refuse
+         * any other negative values. */
+        if ((first_fd < 0) || (end_fd < 0 && end_fd != -1)) {
+                errno = EBADF; /* errno values are positive; callers negate them themselves via -errno */
+                return -1;
+        }
+
+        return syscall(__NR_close_range,
+                       (unsigned) first_fd,
+                       end_fd == -1 ? UINT_MAX : (unsigned) end_fd, /* Of course, the compiler should figure out that this is the identity mapping IRL */
+                       flags);
+#  else
+        errno = ENOSYS;
+        return -1;
+#  endif
+}
+
+#  define close_range missing_close_range
+#endif
--- a/src/shared/seccomp-util.c
+++ b/src/shared/seccomp-util.c
@ -345,6 +345,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                .value =
                "_llseek\0"
                "close\0"
+                "close_range\0"
                "dup\0"
                "dup2\0"
                "dup3\0"
--- a/src/test/test-fd-util.c
+++ b/src/test/test-fd-util.c
@ -11,6 +11,7 @@
 #include "path-util.h"
 #include "process-util.h"
 #include "random-util.h"
+#include "rlimit-util.h"
 #include "serialize.h"
 #include "string-util.h"
 #include "tests.h"
@ -317,6 +318,100 @@ static void test_read_nr_open(void) {
        log_info("nr-open: %i", read_nr_open());
 }

+static size_t validate_fds(
+                bool opened,
+                const int *fds,
+                size_t n_fds) {
+
+        size_t n_checked = 0, i = 0;
+
+        /* Walks the array and asserts on the state of every non-negative entry:
+         *
+         *  - entries < 0 are skipped and not counted,
+         *  - if 'opened' is true, each remaining fd must be open (F_GETFD succeeds),
+         *  - if 'opened' is false, each remaining fd must be closed (F_GETFD fails with EBADF).
+         */
+
+        while (i < n_fds) {
+                if (fds[i] >= 0) {
+                        if (!opened)
+                                assert_se(fcntl(fds[i], F_GETFD) < 0 && errno == EBADF);
+                        else
+                                assert_se(fcntl(fds[i], F_GETFD) >= 0);
+
+                        n_checked++;
+                }
+
+                i++;
+        }
+
+        /* Number of entries that were >= 0, i.e. that were actually checked */
+        return n_checked;
+}
+
+static void test_close_all_fds(void) {
+        _cleanup_free_ int *fds = NULL, *keep = NULL;
+        struct rlimit rl;
+        size_t n_fds, n_keep;
+        /* Test for close_all_fds(): opens many pipe fds, moves a random subset into 'keep', then checks
+         * that a call with 'keep' as exception list closes the others but not the kept ones, and that
+         * a second call without exceptions closes the kept ones as well. */
+        log_info("/* %s */", __func__);
+
+        rlimit_nofile_bump(-1);
+
+        assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
+        assert_se(rl.rlim_cur > 10);
+
+        /* Aim for 5000 fds; if the current RLIMIT_NOFILE soft limit is too low for that, use the limit
+         * (rounded down to an even number) minus 10 instead */
+        n_fds = MIN((rl.rlim_cur & ~1U) - 10U, 5000U);
+        assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */
+
+        /* Allocate the determined number of fds, always two at a time (each pipe2() fills two slots) */
+        assert_se(fds = new(int, n_fds));
+        for (size_t i = 0; i < n_fds; i += 2)
+                assert_se(pipe2(fds + i, O_CLOEXEC) >= 0);
+
+        /* Validate this worked */
+        assert_se(validate_fds(true, fds, n_fds) == n_fds);
+
+        /* Random number of fds to keep, in the range 0 … n_fds/2 - 1, i.e. always fewer than half.
+         * NOTE(review): n_fds / 2 would be 0 (modulo by zero) if rlim_cur were exactly 11 — confirm
+         * that cannot happen in practice. */
+        n_keep = (random_u64() % (n_fds / 2));
+
+        /* Now randomly select that many fds from the array above to keep. Slots whose value is already
+         * < 0 are skipped and another random slot is tried; the check further down expects every slot
+         * taken via TAKE_FD() to read as < 0 afterwards. */
+        assert_se(keep = new(int, n_keep));
+        for (size_t k = 0; k < n_keep; k++) {
+                for (;;) {
+                        size_t p;
+
+                        p = random_u64() % n_fds;
+                        if (fds[p] >= 0) {
+                                keep[k] = TAKE_FD(fds[p]);
+                                break;
+                        }
+                }
+        }
+
+        /* Check that all fds from both arrays are still open, and test how many in each are >= 0 */
+        assert_se(validate_fds(true, fds, n_fds) == n_fds - n_keep);
+        assert_se(validate_fds(true, keep, n_keep) == n_keep);
+
+        /* Close logging fd first, so that we don't confuse it by closing its fd */
+        log_close();
+        log_set_open_when_needed(true);
+
+        /* Close all but the ones to keep */
+        assert_se(close_all_fds(keep, n_keep) >= 0);
+
+        /* Everything left in fds[] must be closed now, everything in keep[] must still be open */
+        assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
+        assert_se(validate_fds(true, keep, n_keep) == n_keep);
+
+        /* Close everything else too! */
+        assert_se(close_all_fds(NULL, 0) >= 0);
+
+        /* Now the fds in both arrays must be closed */
+        assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
+        assert_se(validate_fds(false, keep, n_keep) == n_keep);
+
+        /* Undo the logging changes made before the first close_all_fds() call */
+        log_set_open_when_needed(false);
+        log_open();
+}
+
 int main(int argc, char *argv[]) {

        test_setup_logging(LOG_DEBUG);
@ -330,6 +425,7 @@ int main(int argc, char *argv[]) {
        test_rearrange_stdio();
        test_fd_duplicate_data_fd();
        test_read_nr_open();
+        test_close_all_fds();

        return 0;
 }