mirror of
https://github.com/systemd/systemd.git
synced 2025-01-03 05:18:09 +03:00
core: add support for pidfd_spawn
Added in glibc 2.39, allows cloning into a cgroup and to get a pid fd back instead of a pid. Removes race conditions for both changing cgroups and getting a reliable reference for the child process. Fixes https://github.com/systemd/systemd/pull/18843 Replaces https://github.com/systemd/systemd/pull/16706
This commit is contained in:
parent
9ca13d60db
commit
2e106312e2
@ -640,6 +640,7 @@ foreach ident : [
|
|||||||
['fsconfig', '''#include <sys/mount.h>'''],
|
['fsconfig', '''#include <sys/mount.h>'''],
|
||||||
['fsmount', '''#include <sys/mount.h>'''],
|
['fsmount', '''#include <sys/mount.h>'''],
|
||||||
['getdents64', '''#include <dirent.h>'''],
|
['getdents64', '''#include <dirent.h>'''],
|
||||||
|
['pidfd_spawn', '''#include <spawn.h>'''],
|
||||||
]
|
]
|
||||||
|
|
||||||
have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
|
have = cc.has_function(ident[0], prefix : ident[1], args : '-D_GNU_SOURCE')
|
||||||
|
@ -2034,10 +2034,12 @@ int make_reaper_process(bool b) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref) {
|
DEFINE_TRIVIAL_CLEANUP_FUNC_FULL(posix_spawnattr_t*, posix_spawnattr_destroy, NULL);
|
||||||
|
|
||||||
|
int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref) {
|
||||||
|
short flags = POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF;
|
||||||
posix_spawnattr_t attr;
|
posix_spawnattr_t attr;
|
||||||
sigset_t mask;
|
sigset_t mask;
|
||||||
pid_t pid;
|
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
/* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
|
/* Forks and invokes 'path' with 'argv' and 'envp' using CLONE_VM and CLONE_VFORK, which means the
|
||||||
@ -2054,26 +2056,78 @@ int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp,
|
|||||||
r = posix_spawnattr_init(&attr);
|
r = posix_spawnattr_init(&attr);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
return -r; /* These functions return a positive errno on failure */
|
return -r; /* These functions return a positive errno on failure */
|
||||||
/* Set all signals to SIG_DFL */
|
|
||||||
r = posix_spawnattr_setflags(&attr, POSIX_SPAWN_SETSIGMASK|POSIX_SPAWN_SETSIGDEF);
|
/* Initialization needs to succeed before we can set up a destructor. */
|
||||||
|
_unused_ _cleanup_(posix_spawnattr_destroyp) posix_spawnattr_t *attr_destructor = &attr;
|
||||||
|
|
||||||
|
#if HAVE_PIDFD_SPAWN
|
||||||
|
_cleanup_close_ int cgroup_fd = -EBADF;
|
||||||
|
|
||||||
|
if (cgroup) {
|
||||||
|
_cleanup_free_ char *resolved_cgroup = NULL;
|
||||||
|
|
||||||
|
r = cg_get_path_and_check(
|
||||||
|
SYSTEMD_CGROUP_CONTROLLER,
|
||||||
|
cgroup,
|
||||||
|
/* suffix= */ NULL,
|
||||||
|
&resolved_cgroup);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
cgroup_fd = open(resolved_cgroup, O_PATH|O_DIRECTORY|O_CLOEXEC);
|
||||||
|
if (cgroup_fd < 0)
|
||||||
|
return -errno;
|
||||||
|
|
||||||
|
r = posix_spawnattr_setcgroup_np(&attr, cgroup_fd);
|
||||||
|
if (r != 0)
|
||||||
|
return -r;
|
||||||
|
|
||||||
|
flags |= POSIX_SPAWN_SETCGROUP;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
r = posix_spawnattr_setflags(&attr, flags);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
goto fail;
|
return -r;
|
||||||
r = posix_spawnattr_setsigmask(&attr, &mask);
|
r = posix_spawnattr_setsigmask(&attr, &mask);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
goto fail;
|
return -r;
|
||||||
|
|
||||||
|
#if HAVE_PIDFD_SPAWN
|
||||||
|
_cleanup_close_ int pidfd = -EBADF;
|
||||||
|
|
||||||
|
r = pidfd_spawn(&pidfd, path, NULL, &attr, argv, envp);
|
||||||
|
if (r == 0) {
|
||||||
|
r = pidref_set_pidfd_consume(ret_pidref, TAKE_FD(pidfd));
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
if (cgroup_fd == -EBADF)
|
||||||
|
return 0; /* We managed to use the new API but we are running under cgroupv1 */
|
||||||
|
|
||||||
|
return 1; /* We managed to use the new API and we are already in the new cgroup */
|
||||||
|
}
|
||||||
|
if (!(ERRNO_IS_NOT_SUPPORTED(r) || ERRNO_IS_PRIVILEGE(r)))
|
||||||
|
return -r;
|
||||||
|
|
||||||
|
/* Compiled on a newer host, or seccomp&friends blocking clone3()? Fallback, but need to change the
|
||||||
|
* flags to remove the cgroup one, which is what redirects to clone3() */
|
||||||
|
flags &= ~POSIX_SPAWN_SETCGROUP;
|
||||||
|
r = posix_spawnattr_setflags(&attr, flags);
|
||||||
|
if (r != 0)
|
||||||
|
return -r;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
pid_t pid;
|
||||||
r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
|
r = posix_spawn(&pid, path, NULL, &attr, argv, envp);
|
||||||
if (r != 0)
|
if (r != 0)
|
||||||
goto fail;
|
return -r;
|
||||||
|
|
||||||
posix_spawnattr_destroy(&attr);
|
r = pidref_set_pid(ret_pidref, pid);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
return pidref_set_pid(ret_pidref, pid);
|
return 0; /* We did not use CLONE_INTO_CGROUP so return 0, the caller will have to move the child */
|
||||||
|
|
||||||
fail:
|
|
||||||
assert(r > 0);
|
|
||||||
posix_spawnattr_destroy(&attr);
|
|
||||||
return -r;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int proc_dir_open(DIR **ret) {
|
int proc_dir_open(DIR **ret) {
|
||||||
|
@ -238,7 +238,7 @@ int get_process_threads(pid_t pid);
|
|||||||
int is_reaper_process(void);
|
int is_reaper_process(void);
|
||||||
int make_reaper_process(bool b);
|
int make_reaper_process(bool b);
|
||||||
|
|
||||||
int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, PidRef *ret_pidref);
|
int posix_spawn_wrapper(const char *path, char *const *argv, char *const *envp, const char *cgroup, PidRef *ret_pidref);
|
||||||
|
|
||||||
int proc_dir_open(DIR **ret);
|
int proc_dir_open(DIR **ret);
|
||||||
int proc_dir_read(DIR *d, pid_t *ret);
|
int proc_dir_read(DIR *d, pid_t *ret);
|
||||||
|
@ -364,6 +364,7 @@ int exec_spawn(Unit *unit,
|
|||||||
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
|
_cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
|
||||||
_cleanup_fdset_free_ FDSet *fdset = NULL;
|
_cleanup_fdset_free_ FDSet *fdset = NULL;
|
||||||
_cleanup_fclose_ FILE *f = NULL;
|
_cleanup_fclose_ FILE *f = NULL;
|
||||||
|
bool move_child = true;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
assert(unit);
|
assert(unit);
|
||||||
@ -404,8 +405,8 @@ int exec_spawn(Unit *unit,
|
|||||||
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
|
* child's memory.max, serialize all the state needed to start the unit, and pass it to the
|
||||||
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
|
* systemd-executor binary. clone() with CLONE_VM + CLONE_VFORK will pause the parent until the exec
|
||||||
* and ensure all memory is shared. The child immediately execs the new binary so the delay should
|
* and ensure all memory is shared. The child immediately execs the new binary so the delay should
|
||||||
* be minimal. Once glibc provides a clone3 wrapper we can switch to that, and clone directly in the
|
* be minimal. If glibc 2.39 is available pidfd_spawn() is used in order to get a race-free pid fd
|
||||||
* target cgroup. */
|
* and to clone directly into the target cgroup (if we booted with cgroupv2). */
|
||||||
|
|
||||||
r = open_serialization_file("sd-executor-state", &f);
|
r = open_serialization_file("sd-executor-state", &f);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
@ -451,16 +452,19 @@ int exec_spawn(Unit *unit,
|
|||||||
"--log-level", log_level,
|
"--log-level", log_level,
|
||||||
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
|
"--log-target", log_target_to_string(manager_get_executor_log_target(unit->manager))),
|
||||||
environ,
|
environ,
|
||||||
|
cg_unified() > 0 ? subcgroup_path : NULL,
|
||||||
&pidref);
|
&pidref);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
|
return log_unit_error_errno(unit, r, "Failed to spawn executor: %m");
|
||||||
|
if (r > 0)
|
||||||
|
move_child = false; /* Already in the right cgroup thanks to CLONE_INTO_CGROUP */
|
||||||
|
|
||||||
log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pidref.pid);
|
log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pidref.pid);
|
||||||
|
|
||||||
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
|
/* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
|
||||||
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
|
* executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
|
||||||
* process will be killed too). */
|
* process will be killed too). */
|
||||||
if (subcgroup_path)
|
if (move_child && subcgroup_path)
|
||||||
(void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid);
|
(void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pidref.pid);
|
||||||
|
|
||||||
exec_status_start(&command->exec_status, pidref.pid);
|
exec_status_start(&command->exec_status, pidref.pid);
|
||||||
|
Loading…
Reference in New Issue
Block a user