1
0
mirror of https://github.com/systemd/systemd.git synced 2024-10-31 07:51:21 +03:00

Merge pull request #20344 from poettering/revert-close-all

Alternative to #20288 – close_all_fds() reworking
This commit is contained in:
Lennart Poettering 2021-10-27 22:02:38 +02:00 committed by GitHub
commit d5bb2b0375
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 310 additions and 116 deletions

View File

@ -187,7 +187,7 @@ _pure_ static bool fd_in_set(int fd, const int fdset[], size_t n_fdset) {
return false;
}
static int get_max_fd(void) {
int get_max_fd(void) {
struct rlimit rl;
rlim_t m;
@ -208,104 +208,17 @@ static int get_max_fd(void) {
return (int) (m - 1);
}
int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
static bool have_close_range = true; /* Assume we live in the future */
_cleanup_closedir_ DIR *d = NULL;
int r = 0;
static int close_all_fds_frugal(const int except[], size_t n_except) {
int max_fd, r = 0;
assert(n_except == 0 || except);
if (have_close_range) {
/* In the best case we have close_range() to close all fds between a start and an end fd,
* which we can use on the "inverted" exception array, i.e. all intervals between all
* adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
* where n is number of open fds to O(m⋅log(m)) where m is the number of fds to keep
* open. Given that we assume n ≫ m that's preferable to us. */
/* This is the inner fallback core of close_all_fds(). This never calls malloc() or opendir() or so
* and hence is safe to be called in signal handler context. Most users should call close_all_fds(),
* but when we assume we are called from signal handler context, then use this simpler call
* instead. */
if (n_except == 0) {
/* Close everything. Yay! */
if (close_range(3, -1, 0) >= 0)
return 0;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
have_close_range = false;
else
return -errno;
} else {
typesafe_qsort(except, n_except, cmp_int);
for (size_t i = 0; i < n_except; i++) {
int start = i == 0 ? 2 : MAX(except[i-1], 2); /* The first three fds shall always remain open */
int end = MAX(except[i], 2);
assert(end >= start);
if (end - start <= 1)
continue;
/* Close everything between the start and end fds (both of which shall stay open) */
if (close_range(start + 1, end - 1, 0) < 0) {
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
have_close_range = false;
else
return -errno;
goto opendir_fallback;
}
}
/* The loop succeeded. Let's now close everything beyond the end */
if (except[n_except-1] >= INT_MAX) /* Don't let the addition below overflow */
return 0;
int start = MAX(except[n_except-1], 2);
if (close_range(start + 1, -1, 0) >= 0)
return 0;
if (ERRNO_IS_NOT_SUPPORTED(errno) || ERRNO_IS_PRIVILEGE(errno))
have_close_range = false;
else
return -errno;
}
}
/* Fallback for when close_range() is not supported */
opendir_fallback:
d = allow_alloc ? opendir("/proc/self/fd") : NULL;
if (d) {
struct dirent *de;
FOREACH_DIRENT(de, d, return -errno) {
int fd = -1, q;
if (safe_atoi(de->d_name, &fd) < 0)
/* Let's better ignore this, just in case */
continue;
if (fd < 3)
continue;
if (fd == dirfd(d))
continue;
if (fd_in_set(fd, except, n_except))
continue;
q = close_nointr(fd);
if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
r = q;
}
return r;
}
/* Fallback for when /proc isn't available (for example in chroots) or when we cannot allocate by
* brute-forcing through the file descriptor table. */
int max_fd = get_max_fd();
max_fd = get_max_fd();
if (max_fd < 0)
return max_fd;
@ -313,7 +226,7 @@ int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
* spin the CPU for a long time. */
if (max_fd > MAX_FD_LOOP_LIMIT)
return log_debug_errno(SYNTHETIC_ERRNO(EPERM),
"/proc/self/fd is inaccessible. Refusing to loop over %d potential fds.",
"Refusing to loop over %d potential fds.",
max_fd);
for (int fd = 3; fd >= 0; fd = fd < max_fd ? fd + 1 : -1) {
@ -330,6 +243,179 @@ int close_all_fds_full(int except[], size_t n_except, bool allow_alloc) {
return r;
}
static bool have_close_range = true; /* Assume we live in the future */

/* Fast path for the two most common invocations, neither of which needs the exception list sorted:
 *
 *   n_except == 0: close every fd from 3 upwards with a single close_range() call.
 *   n_except == 1: close [3, except[0]-1] and [except[0]+1, ∞) with at most two close_range() calls.
 *
 * Returns > 0 if the fast path did the whole job, 0 if the caller has to take the generic route (more
 * than one exception, or close_range() turned out to be unsupported/not permitted, in which case
 * have_close_range is cleared), and a negative errno on any other close_range() failure. */
static int close_all_fds_special_case(const int except[], size_t n_except) {
        bool closed;

        assert(n_except == 0 || except);

        if (!have_close_range)
                return 0;

        if (n_except > 1)
                return 0; /* Needs sorting, not our business */

        if (n_except == 0)
                /* Close everything. Yay! */
                closed = close_range(3, -1, 0) >= 0;
        else {
                int keep = except[0];

                closed = true;

                /* Everything below the fd to keep (nothing to do if it is one of 0…3) */
                if (keep > 3 && close_range(3, keep - 1, 0) < 0)
                        closed = false;

                /* Everything above the fd to keep (guard against overflowing keep + 1) */
                if (closed && keep < INT_MAX && close_range(MAX(3, keep + 1), -1, 0) < 0)
                        closed = false;
        }

        if (closed)
                return 1;

        /* errno still belongs to the close_range() call that failed above */
        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                return -errno;

        have_close_range = false;
        return 0;
}
/* Variant of close_all_fds() that only uses close_all_fds_special_case() and, if that did not finish
 * the job, close_all_fds_frugal(). Returns 0 on success, negative errno-style value on failure. */
int close_all_fds_without_malloc(const int except[], size_t n_except) {
        int special;

        assert(n_except == 0 || except);

        special = close_all_fds_special_case(except, n_except);
        if (special != 0)
                /* < 0: hard failure, propagate. > 0: the special case worked, we are done. */
                return special < 0 ? special : 0;

        return close_all_fds_frugal(except, n_except);
}
int close_all_fds(const int except[], size_t n_except) {
_cleanup_closedir_ DIR *d = NULL;
struct dirent *de;
int r = 0;
assert(n_except == 0 || except);
r = close_all_fds_special_case(except, n_except);
if (r < 0)
return r;
if (r > 0) /* special case worked! */
return 0;
if (have_close_range) {
_cleanup_free_ int *sorted_malloc = NULL;
size_t n_sorted;
int *sorted;
/* In the best case we have close_range() to close all fds between a start and an end fd,
* which we can use on the "inverted" exception array, i.e. all intervals between all
* adjacent pairs from the sorted exception array. This changes loop complexity from O(n)
* where n is number of open fds to O(mlog(m)) where m is the number of fds to keep
* open. Given that we assume n m that's preferable to us. */
assert(n_except < SIZE_MAX);
n_sorted = n_except + 1;
if (n_sorted > 64) /* Use heap for large numbers of fds, stack otherwise */
sorted = sorted_malloc = new(int, n_sorted);
else
sorted = newa(int, n_sorted);
if (sorted) {
memcpy(sorted, except, n_except * sizeof(int));
/* Let's add fd 2 to the list of fds, to simplify the loop below, as this
* allows us to cover the head of the array the same way as the body */
sorted[n_sorted-1] = 2;
typesafe_qsort(sorted, n_sorted, cmp_int);
for (size_t i = 0; i < n_sorted-1; i++) {
int start, end;
start = MAX(sorted[i], 2); /* The first three fds shall always remain open */
end = MAX(sorted[i+1], 2);
assert(end >= start);
if (end - start <= 1)
continue;
/* Close everything between the start and end fds (both of which shall stay open) */
if (close_range(start + 1, end - 1, 0) < 0) {
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
break;
}
}
if (have_close_range) {
/* The loop succeeded. Let's now close everything beyond the end */
if (sorted[n_sorted-1] >= INT_MAX) /* Dont let the addition below overflow */
return 0;
if (close_range(sorted[n_sorted-1] + 1, -1, 0) >= 0)
return 0;
if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
return -errno;
have_close_range = false;
}
}
/* Fallback on OOM or if close_range() is not supported */
}
d = opendir("/proc/self/fd");
if (!d)
return close_all_fds_frugal(except, n_except); /* ultimate fallback if /proc/ is not available */
FOREACH_DIRENT(de, d, return -errno) {
int fd = -1, q;
if (!IN_SET(de->d_type, DT_LNK, DT_UNKNOWN))
continue;
if (safe_atoi(de->d_name, &fd) < 0)
/* Let's better ignore this, just in case */
continue;
if (fd < 3)
continue;
if (fd == dirfd(d))
continue;
if (fd_in_set(fd, except, n_except))
continue;
q = close_nointr(fd);
if (q < 0 && q != -EBADF && r >= 0) /* Valgrind has its own FD and doesn't want to have it closed */
r = q;
}
return r;
}
int same_fd(int a, int b) {
struct stat sta, stb;
pid_t pid;

View File

@ -57,10 +57,10 @@ DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(DIR*, closedir, NULL);
int fd_nonblock(int fd, bool nonblock);
int fd_cloexec(int fd, bool cloexec);
int close_all_fds_full(int except[], size_t n_except, bool allow_alloc);
static inline int close_all_fds(int except[], size_t n_except) {
return close_all_fds_full(except, n_except, true);
}
int get_max_fd(void);
int close_all_fds(const int except[], size_t n_except);
int close_all_fds_without_malloc(const int except[], size_t n_except);
int same_fd(int a, int b);

View File

@ -1246,7 +1246,7 @@ static void restore_sigsetp(sigset_t **ssp) {
int safe_fork_full(
const char *name,
int except_fds[],
const int except_fds[],
size_t n_except_fds,
ForkFlags flags,
pid_t *ret_pid) {
@ -1441,7 +1441,7 @@ int safe_fork_full(
int namespace_fork(
const char *outer_name,
const char *inner_name,
int except_fds[],
const int except_fds[],
size_t n_except_fds,
ForkFlags flags,
int pidns_fd,
@ -1457,8 +1457,7 @@ int namespace_fork(
* process. This ensures that we are fully a member of the destination namespace, with pidns an all, so that
* /proc/self/fd works correctly. */
r = safe_fork_full(outer_name, except_fds, n_except_fds,
(flags|FORK_DEATHSIG) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
r = safe_fork_full(outer_name, except_fds, n_except_fds, (flags|FORK_DEATHSIG) & ~(FORK_REOPEN_LOG|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE), ret_pid);
if (r < 0)
return r;
if (r == 0) {
@ -1621,8 +1620,10 @@ bool invoked_as(char *argv[], const char *token) {
_noreturn_ void freeze(void) {
log_close();
/* Make sure nobody waits for us on a socket anymore */
(void) close_all_fds_full(NULL, 0, false);
/* Make sure nobody waits for us (i.e. on one of our sockets) anymore. Note that we use
* close_all_fds_without_malloc() instead of plain close_all_fds() here, since we want this function
* to be compatible with being called from signal handlers. */
(void) close_all_fds_without_malloc(NULL, 0);
/* Let's not freeze right away, but keep reaping zombies. */
for (;;) {

View File

@ -166,13 +166,13 @@ typedef enum ForkFlags {
FORK_NEW_USERNS = 1 << 13, /* Run child in its own user namespace */
} ForkFlags;
int safe_fork_full(const char *name, int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);
int safe_fork_full(const char *name, const int except_fds[], size_t n_except_fds, ForkFlags flags, pid_t *ret_pid);
/* Convenience wrapper around safe_fork_full() that passes an empty fd exception list (NULL, 0).
 * name, flags and ret_pid are forwarded unchanged, and the callee's return value is returned as-is. */
static inline int safe_fork(const char *name, ForkFlags flags, pid_t *ret_pid) {
return safe_fork_full(name, NULL, 0, flags, ret_pid);
}
int namespace_fork(const char *outer_name, const char *inner_name, int except_fds[], size_t n_except_fds, ForkFlags flags, int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd, pid_t *ret_pid);
int namespace_fork(const char *outer_name, const char *inner_name, const int except_fds[], size_t n_except_fds, ForkFlags flags, int pidns_fd, int mntns_fd, int netns_fd, int userns_fd, int root_fd, pid_t *ret_pid);
int set_oom_score_adjust(int value);
int get_oom_score_adjust(int *ret);

View File

@ -471,7 +471,7 @@ int fexecve_or_execve(int executable_fd, const char *executable, char *const arg
return -errno;
}
int fork_agent(const char *name, int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) {
bool stdout_is_tty, stderr_is_tty;
size_t n, i;
va_list ap;

View File

@ -49,4 +49,4 @@ ExecCommandFlags exec_command_flags_from_string(const char *s);
int fexecve_or_execve(int executable_fd, const char *executable, char *const argv[], char *const envp[]);
int fork_agent(const char *name, int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) _sentinel_;
int fork_agent(const char *name, const int except[], size_t n_except, pid_t *ret_pid, const char *path, ...) _sentinel_;

View File

@ -227,7 +227,9 @@ tests += [
[['src/test/test-proc-cmdline.c']],
[['src/test/test-fd-util.c']],
[['src/test/test-fd-util.c'],
[],
[libseccomp]],
[['src/test/test-web-util.c']],

View File

@ -9,10 +9,13 @@
#include "fileio.h"
#include "macro.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "mount-util.h"
#include "path-util.h"
#include "process-util.h"
#include "random-util.h"
#include "rlimit-util.h"
#include "seccomp-util.h"
#include "serialize.h"
#include "string-util.h"
#include "tests.h"
@ -213,20 +216,29 @@ static size_t validate_fds(
return c; /* Return number of fds >= 0 in the array */
}
static void test_close_all_fds(void) {
static void test_close_all_fds_inner(void) {
_cleanup_free_ int *fds = NULL, *keep = NULL;
struct rlimit rl;
size_t n_fds, n_keep;
int max_fd;
log_info("/* %s */", __func__);
rlimit_nofile_bump(-1);
assert_se(getrlimit(RLIMIT_NOFILE, &rl) >= 0);
assert_se(rl.rlim_cur > 10);
max_fd = get_max_fd();
assert_se(max_fd > 10);
if (max_fd > 7000) {
/* If the worst fallback is activated we need to iterate through all possible fds, hence,
* let's lower the limit a small bit, so that we don't run for too long. Yes, this undoes the
* rlimit_nofile_bump() call above partially. */
(void) setrlimit_closest(RLIMIT_NOFILE, &(struct rlimit) { 7000, 7000 });
max_fd = 7000;
}
/* Try to use 5000 fds, but when we can't bump the rlimit to make that happen use the whole limit minus 10 */
n_fds = MIN((rl.rlim_cur & ~1U) - 10U, 5000U);
n_fds = MIN(((size_t) max_fd & ~1U) - 10U, 5000U);
assert_se((n_fds & 1U) == 0U); /* make sure even number of fds */
/* Allocate the determined number of fds, always two at a time */
@ -278,6 +290,99 @@ static void test_close_all_fds(void) {
log_open();
}
/* Installs a seccomp filter for the native architecture that allows everything by default but makes
 * the close_range() syscall fail with EPERM. Used by the tests to force close_all_fds() onto its
 * fallback paths.
 *
 * Returns 0 once the filter is loaded. On any failure a warning is logged and the value of
 * log_warning_errno() is returned (presumably the negative errno passed in). If seccomp support or
 * the close_range() syscall number is not available at build time, the same is done with
 * EOPNOTSUPP. */
static int seccomp_prohibit_close_range(void) {
#if defined(HAVE_SECCOMP) && defined(__SNR_close_range)
        _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
        int r;

        r = seccomp_init_for_arch(&seccomp, SCMP_ARCH_NATIVE, SCMP_ACT_ALLOW);
        if (r < 0)
                return log_warning_errno(r, "Failed to acquire seccomp context, ignoring: %m");

        /* A single rule with no argument filters: every close_range() call is refused */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(close_range),
                        0);
        if (r < 0)
                return log_warning_errno(r, "Failed to add close_range() rule, ignoring: %m");

        r = seccomp_load(seccomp);
        if (r < 0)
                return log_warning_errno(r, "Failed to apply close_range() restrictions, ignoring: %m");

        return 0;
#else
        return log_warning_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Seccomp support or close_range() syscall definition not available.");
#endif
}
/* Runs test_close_all_fds_inner() in up to four forked children, so that all fallbacks of
 * close_all_fds() get triggered:
 *
 *   (caf-plain)   as is
 *   (caf-noproc)  with /proc overmounted by a tmpfs, in a new mount namespace
 *   (caf-seccomp) with the close_range() syscall blocked via seccomp
 *   (caf-scnp)    with the combination of both
 *
 * Everything but the first variant is skipped when not running with euid 0, and the two seccomp
 * variants are skipped when seccomp is not available. A child whose environment setup fails logs a
 * notice and exits successfully without running the inner test. */
static void test_close_all_fds(void) {
        int r;

        r = safe_fork("(caf-plain)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
        if (r == 0) {
                test_close_all_fds_inner();
                _exit(EXIT_SUCCESS);
        }
        assert_se(r >= 0);

        if (geteuid() != 0) {
                log_notice("Lacking privileges, skipping running tests with blocked close_range() and with /proc/ overmounted.");
                return;
        }

        r = safe_fork("(caf-noproc)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL);
        if (r == 0) {
                r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL);
                if (r < 0)
                        log_notice("Overmounting /proc didn't work, skipping close_all_fds() with masked /proc/.");
                else
                        test_close_all_fds_inner();
                _exit(EXIT_SUCCESS);
        }
        assert_se(r >= 0);

        if (!is_seccomp_available()) {
                log_notice("Seccomp not available, skipping seccomp tests in %s", __func__);
                return;
        }

        r = safe_fork("(caf-seccomp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT, NULL);
        if (r == 0) {
                r = seccomp_prohibit_close_range();
                if (r < 0)
                        log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range().");
                else
                        test_close_all_fds_inner();
                _exit(EXIT_SUCCESS);
        }
        assert_se(r >= 0);

        r = safe_fork("(caf-scnp)", FORK_CLOSE_ALL_FDS|FORK_DEATHSIG|FORK_LOG|FORK_WAIT|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE, NULL);
        if (r == 0) {
                r = seccomp_prohibit_close_range();
                if (r < 0)
                        log_notice("Applying seccomp filter didn't work, skipping close_all_fds() test with masked close_range().");
                else {
                        r = mount_nofollow_verbose(LOG_WARNING, "tmpfs", "/proc", "tmpfs", 0, NULL);
                        if (r < 0)
                                log_notice("Overmounting /proc didn't work, skipping close_all_fds() with masked /proc/.");
                        else
                                test_close_all_fds_inner();
                }

                /* Run the inner test only in the branch above: an unconditional call here would run
                 * it a second time on success, and would run it despite the "skipping" notices when
                 * the setup failed. */
                _exit(EXIT_SUCCESS);
        }
        assert_se(r >= 0);
}
static void test_format_proc_fd_path(void) {
assert_se(streq_ptr(FORMAT_PROC_FD_PATH(0), "/proc/self/fd/0"));
assert_se(streq_ptr(FORMAT_PROC_FD_PATH(1), "/proc/self/fd/1"));