mirror of
https://github.com/systemd/systemd.git
synced 2024-12-22 17:35:35 +03:00
core: Set /proc/pid/setgroups to allow for PrivateUsers=full
When trying to run dbus-broker in a systemd unit with PrivateUsers=full, dbus-broker fails with EPERM in `util_audit_drop_permissions`. The root cause is that dbus-broker calls the setgroups() system call, which systemd's implementation of PrivateUsers= disallows by writing "deny" to /proc/pid/setgroups. This is done to mitigate potential privilege escalation vulnerabilities in user namespaces, where an attacker could drop supplementary groups and thereby gain access to resources that are denied to members of those groups. However, for OS-like containers, setgroups() is a fairly common API and disabling it is not feasible. So we allow setgroups() by setting /proc/pid/setgroups to "allow" for PrivateUsers=full. Note that security-conscious users can still use SystemCallFilter= to disable setgroups() if they specifically want to prevent this system call. Fixes: #35425
This commit is contained in:
parent
705cc82938
commit
2665425176
@ -2027,8 +2027,11 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
|
||||
often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
|
||||
|
||||
<para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
|
||||
mapping for all UIDs/GIDs. Similar to <literal>identity</literal>, this does not provide UID/GID
|
||||
isolation, but it does provide process capability isolation.</para>
|
||||
mapping for all UIDs/GIDs. In addition, for system services, <literal>full</literal> allows the unit
|
||||
to call the <function>setgroups()</function> system call (by setting
|
||||
<filename>/proc/<replaceable>pid</replaceable>/setgroups</filename> to <literal>allow</literal>).
|
||||
Similar to <literal>identity</literal>, this does not provide UID/GID isolation, but it does provide
|
||||
process capability isolation.</para>
|
||||
|
||||
<para>If this mode is enabled, all unit processes are run without privileges in the host user
|
||||
namespace (regardless of whether the unit's own user/group is <literal>root</literal> or not). Specifically
|
||||
|
@ -2077,7 +2077,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
|
||||
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
|
||||
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
|
||||
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
|
||||
_cleanup_close_ int unshare_ready_fd = -EBADF;
|
||||
@ -2196,7 +2196,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
|
||||
report_errno_and_exit(errno_pipe[1], -errno);
|
||||
|
||||
/* Disable the setgroups() system call in the child user namespace, for good. */
|
||||
/* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
|
||||
* and using the system service manager. */
|
||||
a = procfs_file_alloca(ppid, "setgroups");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
@ -2207,8 +2208,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
|
||||
/* If the file is missing the kernel is too old, let's continue anyway. */
|
||||
} else {
|
||||
if (write(fd, "deny\n", 5) < 0) {
|
||||
r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
|
||||
const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
|
||||
if (write(fd, setgroups, strlen(setgroups)) < 0) {
|
||||
r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
|
||||
report_errno_and_exit(errno_pipe[1], r);
|
||||
}
|
||||
|
||||
@ -5007,7 +5009,9 @@ int exec_invoke(
|
||||
if (pu == PRIVATE_USERS_NO)
|
||||
pu = PRIVATE_USERS_SELF;
|
||||
|
||||
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid);
|
||||
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
|
||||
* unprivileged user namespaces. */
|
||||
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
|
||||
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
|
||||
* the actual requested operations fail (or silently continue). */
|
||||
if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
|
||||
@ -5177,7 +5181,8 @@ int exec_invoke(
|
||||
* different user namespace). */
|
||||
|
||||
if (needs_sandboxing && !userns_set_up) {
|
||||
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid);
|
||||
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
|
||||
/* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_USER;
|
||||
return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
|
||||
|
@ -6,9 +6,12 @@ set -o pipefail
|
||||
|
||||
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
|
||||
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
|
||||
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'
|
||||
|
Loading…
Reference in New Issue
Block a user