1
0
mirror of https://github.com/systemd/systemd.git synced 2024-12-22 17:35:35 +03:00

core: Set /proc/pid/setgroups to allow for PrivateUsers=full

When running dbus-broker in a systemd unit with PrivateUsers=full,
dbus-broker fails with EPERM in `util_audit_drop_permissions`.

The root cause is that dbus-broker calls the setgroups() system call, which
systemd's implementation of PrivateUsers= disallows by writing "deny" to
/proc/pid/setgroups. This is done to mitigate potential privilege
escalation vulnerabilities in user namespaces, where an attacker could drop
supplementary groups and thereby gain access to resources that are denied
to members of those groups.

However, for OS-like containers, setgroups() is a pretty common API and
disabling it is not feasible. So we allow setgroups() by setting
/proc/pid/setgroups to "allow" with PrivateUsers=full. Note that
security-conscious users can still use SystemCallFilter= to block
setgroups() if they specifically want to prevent this system call.

Fixes: #35425
This commit is contained in:
Ryan Wilson 2024-11-30 14:14:35 -08:00 committed by Daan De Meyer
parent 705cc82938
commit 2665425176
3 changed files with 19 additions and 8 deletions

View File

@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
<para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
mapping for all UIDs/GIDs. Similar to <literal>identity</literal>, this does not provide UID/GID
isolation, but it does provide process capability isolation.</para>
mapping for all UIDs/GIDs. In addition, for system services, <literal>full</literal> allows the unit
to call the <function>setgroups()</function> system call (by setting
<filename>/proc/<replaceable>pid</replaceable>/setgroups</filename> to <literal>allow</literal>).
Similar to <literal>identity</literal>, this does not provide UID/GID isolation, but it does provide
process capability isolation.</para>
<para>If this mode is enabled, all unit processes are run without privileges in the host user
namespace (regardless if the unit's own user/group is <literal>root</literal> or not). Specifically

View File

@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
return 0;
}
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
_cleanup_close_ int unshare_ready_fd = -EBADF;
@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
report_errno_and_exit(errno_pipe[1], -errno);
/* Disable the setgroups() system call in the child user namespace, for good. */
/* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
* and using the system service manager. */
a = procfs_file_alloca(ppid, "setgroups");
fd = open(a, O_WRONLY|O_CLOEXEC);
if (fd < 0) {
@ -2207,8 +2208,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
/* If the file is missing the kernel is too old, let's continue anyway. */
} else {
if (write(fd, "deny\n", 5) < 0) {
r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
if (write(fd, setgroups, strlen(setgroups)) < 0) {
r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
report_errno_and_exit(errno_pipe[1], r);
}
@ -5007,7 +5009,9 @@ int exec_invoke(
if (pu == PRIVATE_USERS_NO)
pu = PRIVATE_USERS_SELF;
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid);
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
* unprivileged user namespaces. */
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
* the actual requested operations fail (or silently continue). */
if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
@ -5177,7 +5181,8 @@ int exec_invoke(
* different user namespace). */
if (needs_sandboxing && !userns_set_up) {
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid);
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
/* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
if (r < 0) {
*exit_status = EXIT_USER;
return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");

View File

@ -6,9 +6,12 @@ set -o pipefail
# Each check runs a transient unit via systemd-run and compares a /proc/self file
# inside the unit against the expected content for the given PrivateUsers mode.

# PrivateUsers=yes / PrivateUsersEx=yes: single-entry uid/gid map, setgroups denied.
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
# PrivateUsersEx=self: same expectations as "yes" above, setgroups denied.
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
# PrivateUsersEx=identity: one map entry covering the first 65536 IDs.
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"'
# PrivateUsersEx=full: the map spans two lines (joined here with tr -d "\n"),
# and this is the only mode checked here that expects setgroups to be "allow".
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'