mirror of
https://github.com/systemd/systemd.git
synced 2025-01-09 01:18:19 +03:00
core: Add PrivateUsers=full (#35183)
Recently, PrivateUsers=identity was added to support mapping the first 65536 UIDs/GIDs from parent to the child namespace and mapping the other UID/GIDs to the nobody user. However, there are use cases where users have UIDs/GIDs > 65536 and need to do a similar identity mapping. Moreover, in some of those cases, users want a full identity mapping from 0 -> UID_MAX. To support this, we add PrivateUsers=full that does identity mapping for all available UID/GIDs. Note that to differentiate ourselves from the init user namespace, we need to set up the uid_map/gid_map with two extents, `0 0 1` and `1 1 UINT32_MAX - 1`, as the init user namespace uses `0 0 UINT32_MAX` and some applications - like systemd itself - determine if it is a non-init user namespace based on uid_map/gid_map files. Note systemd will remove this heuristic in running_in_userns() in version 258 (https://github.com/systemd/systemd/pull/35382) and use the namespace inode instead. But some users may be running a container image with older systemd < 258 so we keep this hack until version 259 for version N-1 compatibility. In addition to mapping the whole UID/GID space, we also set /proc/pid/setgroups to "allow". While we usually set "deny" to avoid security issues with dropping supplementary groups (https://lwn.net/Articles/626665/), this ends up breaking dbus-broker when running /sbin/init in full OS containers. Fixes: #35168 Fixes: #35425
This commit is contained in:
commit
6dfd290031
@ -2009,8 +2009,8 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
|
||||
<varlistentry>
|
||||
<term><varname>PrivateUsers=</varname></term>
|
||||
|
||||
<listitem><para>Takes a boolean argument or one of <literal>self</literal> or
|
||||
<literal>identity</literal>. Defaults to false. If enabled, sets up a new user namespace for the
|
||||
<listitem><para>Takes a boolean argument or one of <literal>self</literal>, <literal>identity</literal>,
|
||||
or <literal>full</literal>. Defaults to false. If enabled, sets up a new user namespace for the
|
||||
executed processes and configures a user and group mapping. If set to a true value or
|
||||
<literal>self</literal>, a minimal user and group mapping is configured that maps the
|
||||
<literal>root</literal> user and group as well as the unit's own user and group to themselves and
|
||||
@ -2026,6 +2026,13 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
|
||||
since all UIDs/GIDs are chosen identically it does provide process capability isolation, and hence is
|
||||
often a good choice if proper user namespacing with distinct UID maps is not appropriate.</para>
|
||||
|
||||
<para>If the parameter is <literal>full</literal>, user namespacing is set up with an identity
|
||||
mapping for all UIDs/GIDs. In addition, for system services, <literal>full</literal> allows the unit
|
||||
to call <function>setgroups()</function> system calls (by setting
|
||||
<filename>/proc/<replaceable>pid</replaceable>/setgroups</filename> to <literal>allow</literal>).
|
||||
Similar to <literal>identity</literal>, this does not provide UID/GID isolation, but it does provide
|
||||
process capability isolation.</para>
|
||||
|
||||
<para>If this mode is enabled, all unit processes are run without privileges in the host user
|
||||
namespace (regardless of whether the unit's own user/group is <literal>root</literal> or not). Specifically
|
||||
this means that the process will have zero process capabilities on the host's user namespace, but
|
||||
|
@ -2079,7 +2079,7 @@ static int build_pass_environment(const ExecContext *c, char ***ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
|
||||
static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
|
||||
_cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
|
||||
_cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
|
||||
_cleanup_close_ int unshare_ready_fd = -EBADF;
|
||||
@ -2105,6 +2105,29 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
uid_map = strdup("0 0 65536\n");
|
||||
if (!uid_map)
|
||||
return -ENOMEM;
|
||||
} else if (private_users == PRIVATE_USERS_FULL) {
|
||||
/* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
|
||||
* this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
|
||||
* checks whether it's in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
|
||||
* Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
|
||||
* namespace from the init namespace:
|
||||
* 0 0 1
|
||||
* 1 1 UINT32_MAX - 1
|
||||
*
|
||||
* systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
|
||||
* (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
|
||||
* this uid_map/gid_map hack until version 259 for version N-1 compatibility.
|
||||
*
|
||||
* TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
|
||||
*
|
||||
* Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
|
||||
* the UID range beyond INT32_MAX (i.e. the range above the signed 32-bit range) is
|
||||
* icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
|
||||
* use these UIDs/GIDs so we need to map them. */
|
||||
r = asprintf(&uid_map, "0 0 1\n"
|
||||
"1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
|
||||
if (r < 0)
|
||||
return -ENOMEM;
|
||||
/* Can only set up multiple mappings with CAP_SETUID. */
|
||||
} else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
|
||||
r = asprintf(&uid_map,
|
||||
@ -2125,6 +2148,11 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
gid_map = strdup("0 0 65536\n");
|
||||
if (!gid_map)
|
||||
return -ENOMEM;
|
||||
} else if (private_users == PRIVATE_USERS_FULL) {
|
||||
r = asprintf(&gid_map, "0 0 1\n"
|
||||
"1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
|
||||
if (r < 0)
|
||||
return -ENOMEM;
|
||||
/* Can only set up multiple mappings with CAP_SETGID. */
|
||||
} else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
|
||||
r = asprintf(&gid_map,
|
||||
@ -2170,7 +2198,8 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
|
||||
report_errno_and_exit(errno_pipe[1], -errno);
|
||||
|
||||
/* Disable the setgroups() system call in the child user namespace, for good. */
|
||||
/* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
|
||||
* and using the system service manager. */
|
||||
a = procfs_file_alloca(ppid, "setgroups");
|
||||
fd = open(a, O_WRONLY|O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
@ -2181,8 +2210,9 @@ static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogi
|
||||
|
||||
/* If the file is missing the kernel is too old, let's continue anyway. */
|
||||
} else {
|
||||
if (write(fd, "deny\n", 5) < 0) {
|
||||
r = log_debug_errno(errno, "Failed to write \"deny\" to %s: %m", a);
|
||||
const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
|
||||
if (write(fd, setgroups, strlen(setgroups)) < 0) {
|
||||
r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
|
||||
report_errno_and_exit(errno_pipe[1], r);
|
||||
}
|
||||
|
||||
@ -4984,7 +5014,9 @@ int exec_invoke(
|
||||
if (pu == PRIVATE_USERS_NO)
|
||||
pu = PRIVATE_USERS_SELF;
|
||||
|
||||
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid);
|
||||
/* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
|
||||
* unprivileged user namespaces. */
|
||||
r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
|
||||
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
|
||||
* the actual requested operations fail (or silently continue). */
|
||||
if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
|
||||
@ -5154,7 +5186,8 @@ int exec_invoke(
|
||||
* different user namespace). */
|
||||
|
||||
if (needs_sandboxing && !userns_set_up) {
|
||||
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid);
|
||||
r = setup_private_users(context->private_users, saved_uid, saved_gid, uid, gid,
|
||||
/* allow_setgroups= */ context->private_users == PRIVATE_USERS_FULL);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_USER;
|
||||
return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
|
||||
|
@ -3380,6 +3380,7 @@ static const char* const private_users_table[_PRIVATE_USERS_MAX] = {
|
||||
[PRIVATE_USERS_NO] = "no",
|
||||
[PRIVATE_USERS_SELF] = "self",
|
||||
[PRIVATE_USERS_IDENTITY] = "identity",
|
||||
[PRIVATE_USERS_FULL] = "full",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(private_users, PrivateUsers, PRIVATE_USERS_SELF);
|
||||
|
@ -73,6 +73,7 @@ typedef enum PrivateUsers {
|
||||
PRIVATE_USERS_NO,
|
||||
PRIVATE_USERS_SELF,
|
||||
PRIVATE_USERS_IDENTITY,
|
||||
PRIVATE_USERS_FULL,
|
||||
_PRIVATE_USERS_MAX,
|
||||
_PRIVATE_USERS_INVALID = -EINVAL,
|
||||
} PrivateUsers;
|
||||
|
@ -6,7 +6,12 @@ set -o pipefail
|
||||
|
||||
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsers=yes --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=yes --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 1"'
|
||||
systemd-run -p PrivateUsersEx=self --wait bash -c 'test "$(cat /proc/self/setgroups)" == "deny"'
|
||||
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/uid_map)" == " 0 0 65536"'
|
||||
systemd-run -p PrivateUsersEx=identity --wait bash -c 'test "$(cat /proc/self/gid_map)" == " 0 0 65536"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/uid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/gid_map | tr -d "\n")" == " 0 0 1 1 1 4294967294"'
|
||||
systemd-run -p PrivateUsersEx=full --wait bash -c 'test "$(cat /proc/self/setgroups)" == "allow"'
|
||||
|
Loading…
Reference in New Issue
Block a user