From 2665425176f19ee26568151a3fabf3a117269b50 Mon Sep 17 00:00:00 2001 From: Ryan Wilson Date: Sat, 30 Nov 2024 14:14:35 -0800 Subject: [PATCH] core: Set /proc/pid/setgroups to allow for PrivateUsers=full When trying to run dbus-broker in a systemd unit with PrivateUsers=full, dbus-broker fails with EPERM in `util_audit_drop_permissions`. The root cause is that dbus-broker calls the setgroups() system call, which systemd's implementation of PrivateUsers= disallows by writing "deny" to /proc/pid/setgroups. This is done to mitigate potential privilege escalation vulnerabilities in user namespaces, where an attacker could drop supplementary groups and thereby gain access to resources that are denied to members of those groups. However, for OS-like containers, setgroups() is a commonly used API and disabling it is not feasible. So we allow setgroups() by writing "allow" to /proc/pid/setgroups when PrivateUsers=full is used. Note that security-conscious users can still use SystemCallFilter= to block setgroups() if they specifically want to prevent this system call. Fixes: #35425 --- man/systemd.exec.xml | 7 +++++-- src/core/exec-invoke.c | 17 +++++++++++------ test/units/TEST-07-PID1.private-users.sh | 3 +++ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 482dbbda80a..b31e64f57c8 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd often a good choice if proper user namespacing with distinct UID maps is not appropriate. If the parameter is full, user namespacing is set up with an identity - mapping for all UIDs/GIDs. Similar to identity, this does not provide UID/GID - isolation, but it does provide process capability isolation. + mapping for all UIDs/GIDs. In addition, for system services, full allows the unit + to call setgroups() system calls (by setting + /proc/pid/setgroups to allow). 
+ Similar to identity, this does not provide UID/GID isolation, but it does provide + process capability isolation. If this mode is enabled, all unit processes are run without privileges in the host user namespace (regardless if the unit's own user/group is root or not). Specifically diff --git a/src/core/exec-invoke.c b/src/core/exec-invoke.c index 682d6449d76..da2d4abd3c6 100644 --- a/src/core/exec-invoke.c +++ b/src/core/exec-invoke.c @@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) { return 0; } -static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) { +static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) { _cleanup_free_ char *uid_map = NULL, *gid_map = NULL; _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR; _cleanup_close_ int unshare_ready_fd = -EBADF; @@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi if (read(unshare_ready_fd, &c, sizeof(c)) < 0) report_errno_and_exit(errno_pipe[1], -errno); - /* Disable the setgroups() system call in the child user namespace, for good. */ + /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full + * and using the system service manager. */ a = procfs_file_alloca(ppid, "setgroups"); fd = open(a, O_WRONLY|O_CLOEXEC); if (fd < 0) { @@ -2207,8 +2208,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi /* If the file is missing the kernel is too old, let's continue anyway. */ } else { - if (write(fd, "deny\n", 5) < 0) { - r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a); + const char *setgroups = allow_setgroups ? 
"allow\n" : "deny\n"; + if (write(fd, setgroups, strlen(setgroups)) < 0) { + r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a); report_errno_and_exit(errno_pipe[1], r); } @@ -5007,7 +5009,9 @@ int exec_invoke( if (pu == PRIVATE_USERS_NO) pu = PRIVATE_USERS_SELF; - r = setup_private_users(pu, saved_uid, saved_gid, uid, gid); + /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in + * unprivileged user namespaces. */ + r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false); /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let * the actual requested operations fail (or silently continue). */ if (r < 0 && context->private_users != PRIVATE_USERS_NO) { @@ -5177,7 +5181,8 @@ int exec_invoke( * different user namespace). */ if (needs_sandboxing && !userns_set_up) { - r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid); + r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid, + /* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL); if (r < 0) { *exit_status = EXIT_USER; return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m"); diff --git a/test/units/TEST-07-PID1.private-users.sh b/test/units/TEST-07-PID1.private-users.sh index ba85248f960..e788f52a2f7 100755 --- a/test/units/TEST-07-PID1.private-users.sh +++ b/test/units/TEST-07-PID1.private-users.sh @@ -6,9 +6,12 @@ set -o pipefail systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"' systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat 
/proc/self/gid_map)" == " 0 0 1"' +systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"' +systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'