mirror of
https://github.com/systemd/systemd.git
synced 2025-03-08 08:58:27 +03:00
Merge pull request #17338 from poettering/close-range
make use of new kernel 5.9 close_range() syscall in close_all_fds()
This commit is contained in:
commit
7848cb8c57
2
TODO
2
TODO
@ -24,8 +24,6 @@ Features:
|
||||
|
||||
* in fd_get_path() if we see (deleted) then do stat and check for st_nlink
|
||||
|
||||
* add support for close_range() added in kernel 5.9
|
||||
|
||||
* Add service setting to run a service within the specified VRF. i.e. do the
|
||||
equivalent of "ip vrf exec".
|
||||
|
||||
|
@ -533,6 +533,7 @@ foreach ident : [
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>'''],
|
||||
['mallinfo', '''#include <malloc.h>'''],
|
||||
['close_range', '''#include <unistd.h>'''],
|
||||
]
|
||||
|
||||
have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
|
||||
|
@ -27,7 +27,7 @@ typedef void (*free_func_t)(void *p);
|
||||
size_t _n_ = n; \
|
||||
assert(!size_multiply_overflow(sizeof(t), _n_)); \
|
||||
assert(sizeof(t)*_n_ <= ALLOCA_MAX); \
|
||||
(t*) alloca(sizeof(t)*_n_); \
|
||||
(t*) alloca((sizeof(t)*_n_) ?: 1); \
|
||||
})
|
||||
|
||||
#define newa0(t, n) \
|
||||
@ -35,14 +35,14 @@ typedef void (*free_func_t)(void *p);
|
||||
size_t _n_ = n; \
|
||||
assert(!size_multiply_overflow(sizeof(t), _n_)); \
|
||||
assert(sizeof(t)*_n_ <= ALLOCA_MAX); \
|
||||
(t*) alloca0(sizeof(t)*_n_); \
|
||||
(t*) alloca0((sizeof(t)*_n_) ?: 1); \
|
||||
})
|
||||
|
||||
#define newdup(t, p, n) ((t*) memdup_multiply(p, sizeof(t), (n)))
|
||||
|
||||
#define newdup_suffix0(t, p, n) ((t*) memdup_suffix0_multiply(p, sizeof(t), (n)))
|
||||
|
||||
#define malloc0(n) (calloc(1, (n)))
|
||||
#define malloc0(n) (calloc(1, (n) ?: 1))
|
||||
|
||||
static inline void *mfree(void *memory) {
|
||||
free(memory);
|
||||
@ -65,7 +65,7 @@ void* memdup_suffix0(const void *p, size_t l); /* We can't use _alloc_() here, s
|
||||
void *_q_; \
|
||||
size_t _l_ = l; \
|
||||
assert(_l_ <= ALLOCA_MAX); \
|
||||
_q_ = alloca(_l_); \
|
||||
_q_ = alloca(_l_ ?: 1); \
|
||||
memcpy(_q_, p, _l_); \
|
||||
})
|
||||
|
||||
@ -135,7 +135,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
|
||||
char *_new_; \
|
||||
size_t _len_ = n; \
|
||||
assert(_len_ <= ALLOCA_MAX); \
|
||||
_new_ = alloca(_len_); \
|
||||
_new_ = alloca(_len_ ?: 1); \
|
||||
(void *) memset(_new_, 0, _len_); \
|
||||
})
|
||||
|
||||
@ -146,7 +146,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
|
||||
size_t _mask_ = (align) - 1; \
|
||||
size_t _size_ = size; \
|
||||
assert(_size_ <= ALLOCA_MAX); \
|
||||
_ptr_ = alloca(_size_ + _mask_); \
|
||||
_ptr_ = alloca((_size_ + _mask_) ?: 1); \
|
||||
(void*)(((uintptr_t)_ptr_ + _mask_) & ~_mask_); \
|
||||
})
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "socket-util.h"
|
||||
#include "sort-util.h"
|
||||
#include "stat-util.h"
|
||||
#include "stdio-util.h"
|
||||
#include "tmpfile-util.h"
|
||||
@ -210,13 +211,102 @@ static int get_max_fd(void) {
|
||||
return (int) (m - 1);
|
||||
}
|
||||
|
||||
/* Comparison callback for int arrays: hands the two pointed-to values to CMP() and returns its
 * result. close_all_fds() passes this to typesafe_qsort() to order the exception list. */
static int cmp_int(const int *a, const int *b) {
        int lhs = *a, rhs = *b;

        return CMP(lhs, rhs);
}
|
||||
|
||||
int close_all_fds(const int except[], size_t n_except) {
|
||||
static bool have_close_range = true; /* Assume we live in the future */
|
||||
_cleanup_closedir_ DIR *d = NULL;
|
||||
struct dirent *de;
|
||||
int r = 0;
|
||||
|
||||
assert(n_except == 0 || except);
|
||||
|
||||
if (have_close_range) {
|
||||
/* In the best case we have close_range() to close all fds between a start and an end fd,
|
||||
* which we can use on the "inverted" exception array, i.e. all intervals between all
|
||||
* adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
|
||||
* where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
|
||||
* open. Given that we assume n ≫ m that's preferable to us. */
|
||||
|
||||
if (n_except == 0) {
|
||||
/* Close everything. Yay! */
|
||||
|
||||
if (close_range(3, -1, 0) >= 0)
|
||||
return 1;
|
||||
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
|
||||
return -errno;
|
||||
|
||||
have_close_range = false;
|
||||
} else {
|
||||
_cleanup_free_ int *sorted_malloc = NULL;
|
||||
size_t n_sorted;
|
||||
int *sorted;
|
||||
|
||||
assert(n_except < SIZE_MAX);
|
||||
n_sorted = n_except + 1;
|
||||
|
||||
if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
|
||||
sorted = sorted_malloc = new(int, n_sorted);
|
||||
else
|
||||
sorted = newa(int, n_sorted);
|
||||
|
||||
if (sorted) {
|
||||
int c = 0;
|
||||
|
||||
memcpy(sorted, except, n_except * sizeof(int));
|
||||
|
||||
/* Let's add fd 2 to the list of fds, to simplify the loop below, as this
|
||||
* allows us to cover the head of the array the same way as the body */
|
||||
sorted[n_sorted-1] = 2;
|
||||
|
||||
typesafe_qsort(sorted, n_sorted, cmp_int);
|
||||
|
||||
for (size_t i = 0; i < n_sorted-1; i++) {
|
||||
int start, end;
|
||||
|
||||
start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
|
||||
end = MAX(sorted[i+1], 2);
|
||||
|
||||
assert(end >= start);
|
||||
|
||||
if (end - start <= 1)
|
||||
continue;
|
||||
|
||||
/* Close everything between the start and end fds (both of which shall stay open) */
|
||||
if (close_range(start + 1, end - 1, 0) < 0) {
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
|
||||
return -errno;
|
||||
|
||||
have_close_range = false;
|
||||
break;
|
||||
}
|
||||
|
||||
c += end - start - 1;
|
||||
}
|
||||
|
||||
if (have_close_range) {
|
||||
/* The loop succeeded. Let's now close everything beyond the end */
|
||||
|
||||
if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
|
||||
return c;
|
||||
|
||||
if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
|
||||
return c + 1;
|
||||
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
|
||||
return -errno;
|
||||
|
||||
have_close_range = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Fallback on OOM or if close_range() is not supported */
|
||||
}
|
||||
|
||||
d = opendir("/proc/self/fd");
|
||||
if (!d) {
|
||||
int fd, max_fd;
|
||||
|
@ -734,3 +734,49 @@ static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info)
|
||||
|
||||
# define rt_sigqueueinfo missing_rt_sigqueueinfo
|
||||
#endif
|
||||
|
||||
/* ======================================================================= */
|
||||
|
||||
/* Our own notion of the close_range() syscall number: 436, run through systemd_SC_arch_bias() */
#define systemd_NR_close_range systemd_SC_arch_bias(436)

/* The headers may hand us an (invalid) negative __NR_close_range due to libseccomp, see PR 13319.
 * A non-negative value is trusted and merely cross-checked against ours at compile time; anything
 * else is dropped and replaced by our own definition. */
#if defined(__NR_close_range) && (__NR_close_range >= 0)
#  ifdef systemd_NR_close_range
assert_cc(__NR_close_range == systemd_NR_close_range);
#  endif
#else
#  ifdef __NR_close_range
#    undef __NR_close_range
#  endif
#  ifdef systemd_NR_close_range
#    define __NR_close_range systemd_NR_close_range
#  endif
#endif
|
||||
|
||||
#if !HAVE_CLOSE_RANGE
|
||||
/* Fallback wrapper for the close_range() system call, used when libc does not provide one.
 *
 * first_fd: first fd of the range; must not be negative.
 * end_fd:   last fd of the range; must not be negative, except for the special value -1 which is
 *           passed to the kernel as UINT_MAX.
 * flags:    passed through to the kernel unmodified.
 *
 * Returns the result of the raw syscall. Returns -1 with errno set to EBADF if the arguments are
 * refused here, or -1 with errno set to ENOSYS if no syscall number is known at build time. */
static inline int missing_close_range(int first_fd, int end_fd, unsigned flags) {
#  ifdef __NR_close_range
        /* Kernel-side the syscall expects fds as unsigned integers (just like close() actually), while
         * userspace exclusively uses signed integers for fds. We don't know just yet how glibc is going to
         * wrap this syscall, but let's assume it's going to be similar to what they do for close(),
         * i.e. make the same unsigned → signed type change from the raw kernel syscall compared to the
         * userspace wrapper. There's only one caveat for this: unlike for close() there's the special
         * UINT_MAX fd value for the 'end_fd' argument. Let's safely map that to -1 here. And let's refuse
         * any other negative values. */
        if ((first_fd < 0) || (end_fd < 0 && end_fd != -1)) {
                /* errno codes are positive; storing a negated code here would make callers that compare
                 * against EBADF, or that return -errno, see a bogus value. */
                errno = EBADF;
                return -1;
        }

        return syscall(__NR_close_range,
                       (unsigned) first_fd,
                       end_fd == -1 ? UINT_MAX : (unsigned) end_fd, /* Of course, the compiler should figure out that this is the identity mapping IRL */
                       flags);
#  else
        /* No syscall number available: report the call as unimplemented */
        errno = ENOSYS;
        return -1;
#  endif
}
|
||||
|
||||
# define close_range missing_close_range
|
||||
#endif
|
||||
|
@ -345,6 +345,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
.value =
|
||||
"_llseek\0"
|
||||
"close\0"
|
||||
"close_range\0"
|
||||
"dup\0"
|
||||
"dup2\0"
|
||||
"dup3\0"
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "random-util.h"
|
||||
#include "rlimit-util.h"
|
||||
#include "serialize.h"
|
||||
#include "string-util.h"
|
||||
#include "tests.h"
|
||||
@ -317,6 +318,100 @@ static void test_read_nr_open(void) {
|
||||
log_info("nr-open: %i", read_nr_open());
|
||||
}
|
||||
|
||||
/* Walks the first n_fds entries of 'fds' and asserts on the state of every non-negative one:
 *
 *   - entries < 0 are ignored,
 *   - if 'opened' is true, fcntl(F_GETFD) on the entry must succeed,
 *   - if 'opened' is false, fcntl(F_GETFD) on the entry must fail with EBADF.
 *
 * Returns how many entries were >= 0, i.e. how many were actually checked. */
static size_t validate_fds(
                bool opened,
                const int *fds,
                size_t n_fds) {

        size_t n_checked = 0;
        size_t i = 0;

        while (i < n_fds) {
                if (fds[i] >= 0) {
                        if (!opened) {
                                assert_se(fcntl(fds[i], F_GETFD) < 0 && errno == EBADF);
                        } else {
                                assert_se(fcntl(fds[i], F_GETFD) >= 0);
                        }

                        n_checked++;
                }

                i++;
        }

        return n_checked;
}
|
||||
|
||||
static void test_close_all_fds(void) {
|
||||
_cleanup_free_ int *fds = NULL, *keep = NULL;
|
||||
struct rlimit rl;
|
||||
size_t n_fds, n_keep;
|
||||
|
||||
log_info("/* %s */", __func__);
|
||||
|
||||
rlimit_nofile_bump(-1);
|
||||
|
||||
assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
|
||||
assert_se(rl.rlim_cur > 10);
|
||||
|
||||
/* Try to use 5000 fds, but when we can't bump the rlimit to make that happen use the whole limit minus 10 */
|
||||
n_fds = MIN((rl.rlim_cur & ~1U) - 10U, 5000U);
|
||||
assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */
|
||||
|
||||
/* Allocate the determined number of fds, always two at a time */
|
||||
assert_se(fds = new(int, n_fds));
|
||||
for (size_t i = 0; i < n_fds; i += 2)
|
||||
assert_se(pipe2(fds + i, O_CLOEXEC) >= 0);
|
||||
|
||||
/* Validate this worked */
|
||||
assert_se(validate_fds(true, fds, n_fds) == n_fds);
|
||||
|
||||
/* Randomized number of fds to keep, but at most every second */
|
||||
n_keep = (random_u64() % (n_fds / 2));
|
||||
|
||||
/* Now randomly select a number of fds from the array above to keep */
|
||||
assert_se(keep = new(int, n_keep));
|
||||
for (size_t k = 0; k < n_keep; k++) {
|
||||
for (;;) {
|
||||
size_t p;
|
||||
|
||||
p = random_u64() % n_fds;
|
||||
if (fds[p] >= 0) {
|
||||
keep[k] = TAKE_FD(fds[p]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Check that all fds from both arrays are still open, and test how many in each are >= 0 */
|
||||
assert_se(validate_fds(true, fds, n_fds) == n_fds - n_keep);
|
||||
assert_se(validate_fds(true, keep, n_keep) == n_keep);
|
||||
|
||||
/* Close logging fd first, so that we don't confuse it by closing its fd */
|
||||
log_close();
|
||||
log_set_open_when_needed(true);
|
||||
|
||||
/* Close all but the ones to keep */
|
||||
assert_se(close_all_fds(keep, n_keep) >= 0);
|
||||
|
||||
assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
|
||||
assert_se(validate_fds(true, keep, n_keep) == n_keep);
|
||||
|
||||
/* Close everything else too! */
|
||||
assert_se(close_all_fds(NULL, 0) >= 0);
|
||||
|
||||
assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
|
||||
assert_se(validate_fds(false, keep, n_keep) == n_keep);
|
||||
|
||||
log_set_open_when_needed(false);
|
||||
log_open();
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
test_setup_logging(LOG_DEBUG);
|
||||
@ -330,6 +425,7 @@ int main(int argc, char *argv[]) {
|
||||
test_rearrange_stdio();
|
||||
test_fd_duplicate_data_fd();
|
||||
test_read_nr_open();
|
||||
test_close_all_fds();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user