From 2e106312e2e8e8f88516039446abcf7d9afdaff2 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Fri, 12 Jan 2024 21:32:20 +0000 Subject: [PATCH] core: add support for pidfd_spawn Added in glibc 2.39, allows cloning into a cgroup and to get a pid fd back instead of a pid. Removes race conditions for both changing cgroups and getting a reliable reference for the child process. Fixes https://github.com/systemd/systemd/pull/18843 Replaces https://github.com/systemd/systemd/pull/16706 --- meson.build | 1 + src/basic/process-util.c | 82 +++++++++++++++++++++++++++++++++------- src/basic/process-util.h | 2 +- src/core/execute.c | 10 +++-- 4 files changed, 77 insertions(+), 18 deletions(-) diff --git a/meson.build b/meson.build index 1001c114a42..4ebe46dd234 100644 --- a/meson.build +++ b/meson.build @@ -640,6 +640,7 @@ foreach ident : [ ['fsconfig', '''#include '''], ['fsmount', '''#include '''], ['getdents64', '''#include '''], + ['pidfd_spawn', '''#include '''], ] have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE') diff --git a/src/basic/process-util.c b/src/basic/process-util.c index 701eea673c5..ebf3344bf1b 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -2034,10 +2034,12 @@ int make_reaper_process(bool b) { return 0; } -int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref) { +DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL); + +int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref) { + short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF; posix_spawnattr_t attr; sigset_t mask; - pid_t pid; int r; /* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the @@ -2054,26 +2056,78 @@ int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, r = posix_spawnattr_init(&attr); if (r != 0) return -r; /* These functions return a positive errno on failure */ - /* Set all signals to SIG_DFL */ - r = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF); + + /* Initialization needs to succeed before we can set up a destructor. */ + _unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr; + +#if HAVE_PIDFD_SPAWN + _cleanup_close_ int cgroup_fd = -EBADF; + + if (cgroup) { + _cleanup_free_ char *resolved_cgroup = NULL; + + r = cg_get_path_and_check( + SYSTEMD_CGROUP_CONTROLLER, + cgroup, + /* suffix= */ NULL, + &resolved_cgroup); + if (r < 0) + return r; + + cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC); + if (cgroup_fd < 0) + return -errno; + + r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd); + if (r != 0) + return -r; + + flags |= POSIX_SPAWN_SETCGROUP; + } +#endif + + r = posix_spawnattr_setflags(&attr, flags); if (r != 0) - goto fail; + return -r; r = posix_spawnattr_setsigmask(&attr, &mask); if (r != 0) - goto fail; + return -r; +#if HAVE_PIDFD_SPAWN + _cleanup_close_ int pidfd = -EBADF; + + r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp); + if (r == 0) { + r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd)); + if (r < 0) + return r; + + if (cgroup_fd == -EBADF) + return 0; /* We managed to use the new API but we are running under cgroupv1 */ + + return 1; /* We managed to use the new API and we are already in the new cgroup */ + } + if (!(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r))) + return -r; + + /* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the + * flags to remove the cgroup one, which is what redirects to clone3() */ + flags &= ~POSIX_SPAWN_SETCGROUP; + r = posix_spawnattr_setflags(&attr, flags); + if (r != 0) + return -r; +#endif + + pid_t pid; r = posix_spawn(&pid, path, NULL, &attr, argv, envp); if (r != 0) - goto fail; + return -r; - posix_spawnattr_destroy(&attr); + r = pidref_set_pid(ret_pidref, pid); + if (r < 0) + return r; - return pidref_set_pid(ret_pidref, pid); - -fail: - assert(r > 0); - posix_spawnattr_destroy(&attr); - return -r; + return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */ } int proc_dir_open(DIR **ret) { diff --git a/src/basic/process-util.h b/src/basic/process-util.h index bfa43318956..06c7f0a7d86 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -238,7 +238,7 @@ int get_process_threads(pid_t pid); int is_reaper_process(void); int make_reaper_process(bool b); -int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref); +int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref); int proc_dir_open(DIR **ret); int proc_dir_read(DIR *d, pid_t *ret); diff --git a/src/core/execute.c b/src/core/execute.c index c9c07714a3b..e5062e1b466 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -364,6 +364,7 @@ int exec_spawn(Unit *unit, _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL; _cleanup_fdset_free_ FDSet *fdset = NULL; _cleanup_fclose_ FILE *f = NULL; + bool move_child = true; int r; assert(unit); @@ -404,8 +405,8 @@ int exec_spawn(Unit *unit, * child's memory.max, serialize all the state needed to start the unit, and pass it to the * systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec * and ensure all memory is shared. The child immediately execs the new binary so the delay should - * be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the - * target cgroup. */ + * be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd + * and to clone directly into the target cgroup (if we booted with cgroupv2). */ r = open_serialization_file("sd-executor-state", &f); if (r < 0) @@ -451,16 +452,19 @@ int exec_spawn(Unit *unit, "--log-level", log_level, "--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))), environ, + cg_unified() > 0 ? subcgroup_path : NULL, &pidref); if (r < 0) return log_unit_error_errno(unit, r, "Failed to spawn executor: %m"); + if (r > 0) + move_child = false; /* Already in the right cgroup thanks to CLONE_INTO_CGROUP */ log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pidref.pid); /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the * process will be killed too). */ - if (subcgroup_path) + if (move_child && subcgroup_path) (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid); exec_status_start(&command->exec_status, pidref.pid);