1
0
mirror of https://github.com/systemd/systemd.git synced 2025-03-08 08:58:27 +03:00

Merge pull request #17338 from poettering/close-range

make use of new kernel 5.9 close_range() syscall in close_all_fds()
This commit is contained in:
Lennart Poettering 2020-10-14 17:22:15 +02:00 committed by GitHub
commit 7848cb8c57
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 240 additions and 8 deletions

2
TODO
View File

@ -24,8 +24,6 @@ Features:
* in fd_get_path() if we see (deleted) then do stat and check for st_nlink
* add support for close_range() added in kernel 5.9
* Add service setting to run a service within the specified VRF. i.e. do the
equivalent of "ip vrf exec".

View File

@ -533,6 +533,7 @@ foreach ident : [
#include <signal.h>
#include <sys/wait.h>'''],
['mallinfo', '''#include <malloc.h>'''],
['close_range', '''#include <unistd.h>'''],
]
have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')

View File

@ -27,7 +27,7 @@ typedef void (*free_func_t)(void *p);
size_t _n_ = n; \
assert(!size_multiply_overflow(sizeof(t), _n_)); \
assert(sizeof(t)*_n_ <= ALLOCA_MAX); \
(t*) alloca(sizeof(t)*_n_); \
(t*) alloca((sizeof(t)*_n_) ?: 1); \
})
#define newa0(t, n) \
@ -35,14 +35,14 @@ typedef void (*free_func_t)(void *p);
size_t _n_ = n; \
assert(!size_multiply_overflow(sizeof(t), _n_)); \
assert(sizeof(t)*_n_ <= ALLOCA_MAX); \
(t*) alloca0(sizeof(t)*_n_); \
(t*) alloca0((sizeof(t)*_n_) ?: 1); \
})
#define newdup(t, p, n) ((t*) memdup_multiply(p, sizeof(t), (n)))
#define newdup_suffix0(t, p, n) ((t*) memdup_suffix0_multiply(p, sizeof(t), (n)))
#define malloc0(n) (calloc(1, (n)))
#define malloc0(n) (calloc(1, (n) ?: 1))
static inline void *mfree(void *memory) {
free(memory);
@ -65,7 +65,7 @@ void* memdup_suffix0(const void *p, size_t l); /* We can't use _alloc_() here, s
void *_q_; \
size_t _l_ = l; \
assert(_l_ <= ALLOCA_MAX); \
_q_ = alloca(_l_); \
_q_ = alloca(_l_ ?: 1); \
memcpy(_q_, p, _l_); \
})
@ -135,7 +135,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
char *_new_; \
size_t _len_ = n; \
assert(_len_ <= ALLOCA_MAX); \
_new_ = alloca(_len_); \
_new_ = alloca(_len_ ?: 1); \
(void *) memset(_new_, 0, _len_); \
})
@ -146,7 +146,7 @@ void* greedy_realloc0(void **p, size_t *allocated, size_t need, size_t size);
size_t _mask_ = (align) - 1; \
size_t _size_ = size; \
assert(_size_ <= ALLOCA_MAX); \
_ptr_ = alloca(_size_ + _mask_); \
_ptr_ = alloca((_size_ + _mask_) ?: 1); \
(void*)(((uintptr_t)_ptr_ + _mask_) & ~_mask_); \
})

View File

@ -21,6 +21,7 @@
#include "path-util.h"
#include "process-util.h"
#include "socket-util.h"
#include "sort-util.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "tmpfile-util.h"
@ -210,13 +211,102 @@ static int get_max_fd(void) {
return (int) (m - 1);
}
/* Three-way comparison of two ints, used as the ordering callback when sorting fd arrays. */
static int cmp_int(const int *a, const int *b) {
        const int lhs = *a;
        const int rhs = *b;

        return CMP(lhs, rhs);
}
int close_all_fds(const int except[], size_t n_except) {
static bool have_close_range = true; /* Assume we live in the future */
_cleanup_closedir_ DIR *d = NULL;
struct dirent *de;
int r = 0;
assert(n_except == 0 || except);
if (have_close_range) {
/* In the best case we have close_range() to close all fds between a start and an end fd,
* which we can use on the "inverted" exception array, i.e. all intervals between all
* adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
* where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
* open. Given that we assume n ≫ m that's preferable to us. */
if (n_except == 0) {
/* Close everything. Yay! */
if (close_range(3, -1, 0) >= 0)
return 1;
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
} else {
_cleanup_free_ int *sorted_malloc = NULL;
size_t n_sorted;
int *sorted;
assert(n_except < SIZE_MAX);
n_sorted = n_except + 1;
if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
sorted = sorted_malloc = new(int, n_sorted);
else
sorted = newa(int, n_sorted);
if (sorted) {
int c = 0;
memcpy(sorted, except, n_except * sizeof(int));
/* Let's add fd 2 to the list of fds, to simplify the loop below, as this
* allows us to cover the head of the array the same way as the body */
sorted[n_sorted-1] = 2;
typesafe_qsort(sorted, n_sorted, cmp_int);
for (size_t i = 0; i < n_sorted-1; i++) {
int start, end;
start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
end = MAX(sorted[i+1], 2);
assert(end >= start);
if (end - start <= 1)
continue;
/* Close everything between the start and end fds (both of which shall stay open) */
if (close_range(start + 1, end - 1, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
break;
}
c += end - start - 1;
}
if (have_close_range) {
/* The loop succeeded. Let's now close everything beyond the end */
if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
return c;
if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
return c + 1;
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
}
}
}
/* Fallback on OOM or if close_range() is not supported */
}
d = opendir("/proc/self/fd");
if (!d) {
int fd, max_fd;

View File

@ -734,3 +734,49 @@ static inline int missing_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *info)
# define rt_sigqueueinfo missing_rt_sigqueueinfo
#endif
/* ======================================================================= */
#define systemd_NR_close_range systemd_SC_arch_bias(436)
/* may be (invalid) negative number due to libseccomp, see PR 13319 */
#if defined __NR_close_range && __NR_close_range >= 0
# if defined systemd_NR_close_range
assert_cc(__NR_close_range == systemd_NR_close_range);
# endif
#else
# if defined __NR_close_range
# undef __NR_close_range
# endif
# if defined systemd_NR_close_range
# define __NR_close_range systemd_NR_close_range
# endif
#endif
#if !HAVE_CLOSE_RANGE
/* Fallback wrapper for the close_range() system call, used when libc does not provide one.
 *
 * first_fd: lowest fd of the range to close; must not be negative.
 * end_fd:   highest fd of the range to close (inclusive), or -1 for "up to the largest possible fd".
 * flags:    passed through to the kernel unmodified.
 *
 * Returns the result of the raw system call. On invalid arguments returns -1 with errno set to
 * EBADF; if the syscall number is not known at build time returns -1 with errno set to ENOSYS. */
static inline int missing_close_range(int first_fd, int end_fd, unsigned flags) {
# ifdef __NR_close_range
        /* Kernel-side the syscall expects fds as unsigned integers (just like close() actually), while
         * userspace exclusively uses signed integers for fds. We don't know just yet how glibc is going to
         * wrap this syscall, but let's assume it's going to be similar to what they do for close(),
         * i.e. make the same unsigned → signed type change from the raw kernel syscall compared to the
         * userspace wrapper. There's only one caveat for this: unlike for close() there's the special
         * UINT_MAX fd value for the 'end_fd' argument. Let's safely map that to -1 here. And let's refuse
         * any other negative values. */
        if ((first_fd < 0) || (end_fd < 0 && end_fd != -1)) {
                errno = EBADF; /* errno always carries the positive error code, never a negated one */
                return -1;
        }

        return syscall(__NR_close_range,
                       (unsigned) first_fd,
                       end_fd == -1 ? UINT_MAX : (unsigned) end_fd, /* Of course, the compiler should figure out that this is the identity mapping IRL */
                       flags);
# else
        errno = ENOSYS;
        return -1;
# endif
}
# define close_range missing_close_range
#endif

View File

@ -345,6 +345,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
.value =
"_llseek\0"
"close\0"
"close_range\0"
"dup\0"
"dup2\0"
"dup3\0"

View File

@ -11,6 +11,7 @@
#include "path-util.h"
#include "process-util.h"
#include "random-util.h"
#include "rlimit-util.h"
#include "serialize.h"
#include "string-util.h"
#include "tests.h"
@ -317,6 +318,100 @@ static void test_read_nr_open(void) {
log_info("nr-open: %i", read_nr_open());
}
/* Walks the given fd array and asserts the state of every non-negative entry:
 *
 *   - negative entries are ignored entirely,
 *   - if 'opened' is true each remaining fd must answer F_GETFD successfully,
 *   - if 'opened' is false each remaining fd must fail F_GETFD with EBADF.
 *
 * Returns how many entries of the array were non-negative (i.e. actually checked). */
static size_t validate_fds(
                bool opened,
                const int *fds,
                size_t n_fds) {

        size_t n_checked = 0;

        for (size_t i = 0; i != n_fds; ++i) {
                if (fds[i] >= 0) {
                        if (!opened)
                                assert_se(fcntl(fds[i], F_GETFD) < 0 && errno == EBADF);
                        else
                                assert_se(fcntl(fds[i], F_GETFD) >= 0);

                        n_checked++;
                }
        }

        return n_checked;
}
/* Exercises close_all_fds(): opens a large number of pipe fds, moves a random subset of them into
 * a "keep" array, then checks that close_all_fds() closes exactly the fds not listed in that array,
 * and finally that calling it with an empty exception list closes the kept ones too. */
static void test_close_all_fds(void) {
        _cleanup_free_ int *fds = NULL, *keep = NULL;
        struct rlimit rl;
        size_t n_fds, n_keep;

        log_info("/* %s */", __func__);

        /* NOTE(review): presumably raises RLIMIT_NOFILE as far as permitted — confirm against rlimit-util */
        rlimit_nofile_bump(-1);

        assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
        assert_se(rl.rlim_cur > 10);

        /* Try to use 5000 fds, but when we can't bump the rlimit to make that happen use the whole limit minus 10 */
        n_fds = MIN((rl.rlim_cur & ~1U) - 10U, 5000U);
        assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */

        /* With a limit of exactly 11 the computation above yields 0; n_fds / 2 is used as a modulus
         * below, so insist on at least one pipe pair rather than dividing by zero. */
        assert_se(n_fds >= 2U);

        /* Allocate the determined number of fds, always two at a time */
        assert_se(fds = new(int, n_fds));
        for (size_t i = 0; i < n_fds; i += 2)
                assert_se(pipe2(fds + i, O_CLOEXEC) >= 0);

        /* Validate this worked */
        assert_se(validate_fds(true, fds, n_fds) == n_fds);

        /* Randomized number of fds to keep, but at most every second */
        n_keep = (random_u64() % (n_fds / 2));

        /* Now randomly select a number of fds from the array above to keep */
        assert_se(keep = new(int, n_keep));
        for (size_t k = 0; k < n_keep; k++) {
                for (;;) {
                        size_t p;

                        /* Retry until we hit a slot that was not already moved to the keep array */
                        p = random_u64() % n_fds;
                        if (fds[p] >= 0) {
                                keep[k] = TAKE_FD(fds[p]);
                                break;
                        }
                }
        }

        /* Check that all fds from both arrays are still open, and test how many in each are >= 0 */
        assert_se(validate_fds(true, fds, n_fds) == n_fds - n_keep);
        assert_se(validate_fds(true, keep, n_keep) == n_keep);

        /* Close logging fd first, so that we don't confuse it by closing its fd */
        log_close();
        log_set_open_when_needed(true);

        /* Close all but the ones to keep */
        assert_se(close_all_fds(keep, n_keep) >= 0);

        assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
        assert_se(validate_fds(true, keep, n_keep) == n_keep);

        /* Close everything else too! */
        assert_se(close_all_fds(NULL, 0) >= 0);

        assert_se(validate_fds(false, fds, n_fds) == n_fds - n_keep);
        assert_se(validate_fds(false, keep, n_keep) == n_keep);

        log_set_open_when_needed(false);
        log_open();
}
int main(int argc, char *argv[]) {
test_setup_logging(LOG_DEBUG);
@ -330,6 +425,7 @@ int main(int argc, char *argv[]) {
test_rearrange_stdio();
test_fd_duplicate_data_fd();
test_read_nr_open();
test_close_all_fds();
return 0;
}