1
0
mirror of https://github.com/systemd/systemd.git synced 2024-12-23 21:35:11 +03:00

Merge pull request #3093 from poettering/nspawn-userns-magic

nspawn automatic user namespaces
This commit is contained in:
Lennart Poettering 2016-04-26 14:57:04 +02:00
commit d7fe83bbc2
17 changed files with 1088 additions and 72 deletions

1
.gitignore vendored
View File

@ -240,6 +240,7 @@
/test-ns /test-ns
/test-nss /test-nss
/test-parse-util /test-parse-util
/test-patch-uid
/test-path /test-path
/test-path-lookup /test-path-lookup
/test-path-util /test-path-util

View File

@ -3021,6 +3021,8 @@ systemd_nspawn_SOURCES = \
src/nspawn/nspawn-setuid.h \ src/nspawn/nspawn-setuid.h \
src/nspawn/nspawn-stub-pid1.c \ src/nspawn/nspawn-stub-pid1.c \
src/nspawn/nspawn-stub-pid1.h \ src/nspawn/nspawn-stub-pid1.h \
src/nspawn/nspawn-patch-uid.c \
src/nspawn/nspawn-patch-uid.h \
src/core/mount-setup.c \ src/core/mount-setup.c \
src/core/mount-setup.h \ src/core/mount-setup.h \
src/core/loopback-setup.c \ src/core/loopback-setup.c \
@ -3048,6 +3050,17 @@ systemd_nspawn_LDADD += \
libfirewall.la libfirewall.la
endif endif
test_patch_uid_SOURCES = \
src/nspawn/nspawn-patch-uid.c \
src/nspawn/nspawn-patch-uid.h \
src/nspawn/test-patch-uid.c
test_patch_uid_LDADD = \
libshared.la
manual_tests += \
test-patch-uid
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
systemd_run_SOURCES = \ systemd_run_SOURCES = \
src/run/run.c src/run/run.c

View File

@ -387,38 +387,79 @@
<varlistentry> <varlistentry>
<term><option>--private-users=</option></term> <term><option>--private-users=</option></term>
<listitem><para>Enables user namespacing. If enabled, the <listitem><para>Controls user namespacing. If enabled, the container will run with its own private set of UNIX
container will run with its own private set of Unix user and user and group ids (UIDs and GIDs). This involves mapping the private UIDs/GIDs used in the container (starting
group ids (UIDs and GIDs). Takes none, one or two with the container's root user 0 and up) to a range of UIDs/GIDs on the host that are not used for other
colon-separated parameters: the first parameter specifies the purposes (usually in the range beyond the host's UID/GID 65536). The parameter may be specified as follows:</para>
first host UID to assign to the container, the second
parameter specifies the number of host UIDs to assign to the
container. If the second parameter is omitted, 65536 UIDs are
assigned. If the first parameter is also omitted (and hence
no parameter passed at all), the first UID assigned to the
container is read from the owner of the root directory of the
container's directory tree. By default, no user namespacing is
applied.</para>
<para>Note that user namespacing currently requires OS trees <orderedlist>
that are prepared for the UID shift that is being applied: <listitem><para>The value <literal>no</literal> turns off user namespacing. This is the default.</para></listitem>
UIDs and GIDs used for file ownership or in file ACL entries
must be shifted to the container UID base that is
used during container runtime.</para>
<para>It is recommended to assign at least 65536 UIDs to each <listitem><para>The value <literal>yes</literal> (or the omission of a parameter) turns on user
container, so that the usable UID range in the container namespacing. The UID/GID range to use is determined automatically from the file ownership of the root
covers 16 bit. For best security, do not assign overlapping UID directory of the container's directory tree. To use this option, make sure to prepare the directory tree in
ranges to multiple containers. It is hence a good idea to use advance, and ensure that all files and directories in it are owned by UIDs/GIDs in the range you'd like to
the upper 16 bit of the host 32-bit UIDs as container use. Also, make sure that used file ACLs exclusively reference UIDs/GIDs in the appropriate range. If this
identifier, while the lower 16 bit encode the container UID mode is used the number of UIDs/GIDs assigned to the container for use is 65536, and the UID/GID of the
used.</para> root directory must be a multiple of 65536.</para></listitem>
<para>When user namespaces are used, the GID range assigned to <listitem><para>The value "pick" turns on user namespacing. In this case the UID/GID range is automatically
each container is always chosen identical to the UID chosen. As first step, the file owner of the root directory of the container's directory tree is read, and it
range.</para></listitem> is checked that it is currently not used by the system otherwise (in particular, that no other container is
using it). If this check is successful, the UID/GID range determined this way is used, similar to the
behaviour if "yes" is specified. If the check is not successful (and thus the UID/GID range indicated in the
root directory's file owner is already used elsewhere) a new currently unused UID/GID range of 65536
UIDs/GIDs is randomly chosen between the host UID/GIDs of 524288 and 1878982656, always starting at a
multiple of 65536. This setting implies <option>--private-users-chown</option> (see below), which has the
effect that the files and directories in the container's directory tree will be owned by the appropriate
users of the range picked. Using this option makes user namespace behaviour fully automatic. Note that the
first invocation of a previously unused container image might result in picking a new UID/GID range for it,
and thus in the (possibly expensive) file ownership adjustment operation. However, subsequent invocations of
the container will be cheap (unless of course the picked UID/GID range is assigned to a different use by
then).</para></listitem>
<listitem><para>Finally if one or two colon-separated numeric parameters are specified, user namespacing is
turned on, too. The first parameter specifies the first host UID/GID to assign to the container, the second
parameter specifies the number of host UIDs/GIDs to assign to the container. If the second parameter is
omitted, 65536 UIDs/GIDs are assigned.</para></listitem>
</orderedlist>
<para>It is recommended to assign at least 65536 UIDs/GIDs to each container, so that the usable UID/GID range in the
container covers 16 bit. For best security, do not assign overlapping UID/GID ranges to multiple containers. It is
hence a good idea to use the upper 16 bit of the host 32-bit UIDs/GIDs as container identifier, while the lower 16
bit encode the container UID/GID used. This is in fact the behaviour enforced by the
<option>--private-users=pick</option> option.</para>
<para>When user namespaces are used, the GID range assigned to each container is always chosen identical to the
UID range.</para>
<para>In most cases, using <option>--private-users=pick</option> is the recommended option as it enhances
container security massively and operates fully automatically in most cases.</para>
<para>Note that the picked UID/GID range is not written to <filename>/etc/passwd</filename> or
<filename>/etc/group</filename>. In fact, the allocation of the range is not stored persistently anywhere,
except in the file ownership of the files and directories of the container.</para></listitem>
</varlistentry> </varlistentry>
<varlistentry>
<term><option>-U</option></term>
<listitem><para>If the kernel supports the user namespaces feature, equivalent to
<option>--private-users=pick</option>, otherwise equivalent to
<option>--private-users=no</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--private-users-chown</option></term>
<listitem><para>If specified, all files and directories in the container's directory tree will be adjusted so that
they are owned by the appropriate UIDs/GIDs selected for the container (see above). This operation is
potentially expensive, as it involves descending and iterating through the full directory tree of the
container. Besides actual file ownership, file ACLs are adjusted as well.</para>
<para>This option is implied if <option>--private-users=pick</option> is used. This option has no effect if
user namespacing is not used.</para></listitem>
</varlistentry>
<varlistentry> <varlistentry>
<term><option>--private-network</option></term> <term><option>--private-network</option></term>

View File

@ -251,6 +251,14 @@
<option>--uuid=</option> command line switch. This option is <option>--uuid=</option> command line switch. This option is
privileged (see above). </para></listitem> privileged (see above). </para></listitem>
</varlistentry> </varlistentry>
<varlistentry>
<term><varname>PrivateUsers=</varname></term>
<listitem><para>Configures support for user namespacing. This is equivalent to the
<option>--private-users=</option> command line switch, and takes the same options. This option is privileged
(see above). </para></listitem>
</varlistentry>
</variablelist> </variablelist>
</refsect1> </refsect1>
@ -314,6 +322,16 @@
for details about the specific options supported. This setting for details about the specific options supported. This setting
is privileged (see above).</para></listitem> is privileged (see above).</para></listitem>
</varlistentry> </varlistentry>
<varlistentry>
<term><varname>PrivateUsersChown=</varname></term>
<listitem><para>Configures whether the ownership of the files and directories in the container tree shall be
adjusted to the UID/GID range used, if necessary and user namespacing is enabled. This is equivalent to the
<option>--private-users-chown</option> command line switch. This option is privileged (see
above). </para></listitem>
</varlistentry>
</variablelist> </variablelist>
</refsect1> </refsect1>

View File

@ -102,7 +102,7 @@ int copy_bytes(int fdf, int fdt, uint64_t max_bytes, bool try_reflink) {
if (try_cfr) { if (try_cfr) {
n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u); n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u);
if (n < 0) { if (n < 0) {
if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV)) if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF))
return n; return n;
try_cfr = false; try_cfr = false;

View File

@ -25,11 +25,13 @@
#include <unistd.h> #include <unistd.h>
#include "fd-util.h" #include "fd-util.h"
#include "fs-util.h"
#include "macro.h" #include "macro.h"
#include "missing.h" #include "missing.h"
#include "parse-util.h" #include "parse-util.h"
#include "path-util.h" #include "path-util.h"
#include "socket-util.h" #include "socket-util.h"
#include "stdio-util.h"
#include "util.h" #include "util.h"
int close_nointr(int fd) { int close_nointr(int fd) {
@ -356,3 +358,11 @@ bool fdname_is_valid(const char *s) {
return p - s < 256; return p - s < 256;
} }
/* Formats the "/proc/self/fd/<fd>" path for the specified file descriptor and hands it to readlink_malloc(),
 * together with the caller's result pointer. Whatever readlink_malloc() returns is returned unmodified. */
int fd_get_path(int fd, char **ret) {
        char link_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
        int r;

        xsprintf(link_path, "/proc/self/fd/%i", fd);

        r = readlink_malloc(link_path, ret);
        return r;
}

View File

@ -72,6 +72,8 @@ void cmsg_close_all(struct msghdr *mh);
bool fdname_is_valid(const char *s); bool fdname_is_valid(const char *s);
int fd_get_path(int fd, char **ret);
/* Hint: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5 */ /* Hint: ENETUNREACH happens if we try to connect to "non-existing" special IP addresses, such as ::5 */
#define ERRNO_IS_DISCONNECT(r) \ #define ERRNO_IS_DISCONNECT(r) \
IN_SET(r, ENOTCONN, ECONNRESET, ECONNREFUSED, ECONNABORTED, EPIPE, ENETUNREACH) IN_SET(r, ENOTCONN, ECONNRESET, ECONNREFUSED, ECONNABORTED, EPIPE, ENETUNREACH)

View File

@ -445,6 +445,10 @@ struct btrfs_ioctl_quota_ctl_args {
#define TMPFS_MAGIC 0x01021994 #define TMPFS_MAGIC 0x01021994
#endif #endif
#ifndef MQUEUE_MAGIC
#define MQUEUE_MAGIC 0x19800202
#endif
#ifndef MS_MOVE #ifndef MS_MOVE
#define MS_MOVE 8192 #define MS_MOVE 8192
#endif #endif

View File

@ -21,6 +21,7 @@
#include <stdbool.h> #include <stdbool.h>
#include <sys/types.h> #include <sys/types.h>
#include <unistd.h>
bool uid_is_valid(uid_t uid); bool uid_is_valid(uid_t uid);
@ -63,3 +64,7 @@ int take_etc_passwd_lock(const char *root);
#define PTR_TO_GID(p) ((gid_t) (((uintptr_t) (p))-1)) #define PTR_TO_GID(p) ((gid_t) (((uintptr_t) (p))-1))
#define GID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1)) #define GID_TO_PTR(u) ((void*) (((uintptr_t) (u))+1))
/* Reports whether /proc/self/uid_map exists, which is used as the indicator that user namespaces are
 * available. */
static inline bool userns_supported(void) {
        if (access("/proc/self/uid_map", F_OK) < 0)
                return false;

        return true;
}

View File

@ -16,7 +16,7 @@ struct ConfigPerfItem;
%includes %includes
%% %%
Exec.Boot, config_parse_boot, 0, 0 Exec.Boot, config_parse_boot, 0, 0
Exec.ProcessTwo, config_parse_pid2, 0, 0, Exec.ProcessTwo, config_parse_pid2, 0, 0
Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters) Exec.Parameters, config_parse_strv, 0, offsetof(Settings, parameters)
Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment) Exec.Environment, config_parse_strv, 0, offsetof(Settings, environment)
Exec.User, config_parse_string, 0, offsetof(Settings, user) Exec.User, config_parse_string, 0, offsetof(Settings, user)
@ -26,11 +26,13 @@ Exec.KillSignal, config_parse_signal, 0, offsetof(Settings,
Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality) Exec.Personality, config_parse_personality, 0, offsetof(Settings, personality)
Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id) Exec.MachineID, config_parse_id128, 0, offsetof(Settings, machine_id)
Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory) Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings, working_directory)
Exec.PrivateUsers, config_parse_private_users, 0, 0
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only) Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode) Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
Files.Bind, config_parse_bind, 0, 0 Files.Bind, config_parse_bind, 0, 0
Files.BindReadOnly, config_parse_bind, 1, 0 Files.BindReadOnly, config_parse_bind, 1, 0
Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0 Files.TemporaryFileSystem, config_parse_tmpfs, 0, 0
Files.PrivateUsersChown, config_parse_tristate, 0, offsetof(Settings, userns_chown)
Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network) Network.Private, config_parse_tristate, 0, offsetof(Settings, private_network)
Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces) Network.Interface, config_parse_strv, 0, offsetof(Settings, network_interfaces)
Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan) Network.MACVLAN, config_parse_strv, 0, offsetof(Settings, network_macvlan)

View File

@ -0,0 +1,469 @@
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <fcntl.h>
#include <linux/magic.h>
#ifdef HAVE_ACL
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/vfs.h>
#include <unistd.h>
#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "missing.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"
#ifdef HAVE_ACL
/* Reads the ACL of the specified type, either from the inode fd refers to (name == NULL) or from the
 * entry "name" relative to the directory fd.
 *
 * A named entry is opened with O_PATH|O_NOFOLLOW and then accessed through its /proc/self/fd/ path. For fd
 * itself acl_get_fd() is used for the access ACL, and the /proc/self/fd/ path for any other ACL type.
 *
 * On success stores the ACL in *ret and returns 0, otherwise returns a negative errno value. */
static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        _cleanup_close_ int child_fd = -1;
        int target = fd;
        acl_t result;

        assert(fd >= 0);
        assert(ret);

        if (name) {
                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                target = child_fd;
        }

        if (!name && type == ACL_TYPE_ACCESS)
                result = acl_get_fd(fd);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", target);
                result = acl_get_file(procfs_path, type);
        }

        if (!result)
                return -errno;

        *ret = result;
        return 0;
}
/* Writes an ACL of the specified type, either to the inode fd refers to (name == NULL) or to the entry
 * "name" relative to the directory fd.
 *
 * A named entry is opened with O_PATH|O_NOFOLLOW and then accessed through its /proc/self/fd/ path. For fd
 * itself acl_set_fd() is used for the access ACL, and the /proc/self/fd/ path for any other ACL type.
 *
 * Returns 0 on success, a negative errno value otherwise. */
static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
        char procfs_path[strlen("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        _cleanup_close_ int child_fd = -1;
        int target = fd, k;

        assert(fd >= 0);
        assert(acl);

        if (name) {
                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                target = child_fd;
        }

        if (!name && type == ACL_TYPE_ACCESS)
                k = acl_set_fd(fd, acl);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", target);
                k = acl_set_file(procfs_path, type, acl);
        }

        if (k < 0)
                return -errno;

        return 0;
}
/* Computes a version of "acl" in which the qualifier of every ACL_USER/ACL_GROUP entry is replaced by
 * shift | (qualifier & 0xFFFF).
 *
 * A copy is only created once the first entry that actually needs changing is found; iteration then restarts
 * from the first entry so that all entries end up in the copy.
 *
 * Returns > 0 and stores the new ACL in *ret if at least one entry had to be changed, returns 0 and stores
 * NULL in *ret if the ACL is already fine, and returns a negative errno value on failure. */
static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
        _cleanup_(acl_freep) acl_t copy = NULL;
        acl_entry_t i;
        int r;

        assert(acl);
        assert(ret);

        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
        if (r < 0)
                return -errno;

        while (r > 0) {
                uid_t new_uid;
                bool modify = false;
                acl_tag_t tag;

                if (acl_get_tag_type(i, &tag) < 0)
                        return -errno;

                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
                        uid_t *qualifier, old_uid;

                        /* We don't distinguish here between uid_t and gid_t, let's make sure the compiler checks that
                         * this is actually OK */
                        assert_cc(sizeof(uid_t) == sizeof(gid_t));

                        qualifier = acl_get_qualifier(i);
                        if (!qualifier)
                                return -errno;

                        /* The qualifier is handed to us as a separately allocated object that we own. Copy the
                         * value out and release it right away, so that it isn't leaked on any of the paths
                         * below. */
                        old_uid = *qualifier;
                        (void) acl_free(qualifier);

                        new_uid = shift | (old_uid & UINT32_C(0xFFFF));
                        if (!uid_is_valid(new_uid))
                                return -EINVAL;

                        modify = new_uid != old_uid;
                        if (modify && !copy) {
                                int n;

                                /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
                                 * beginning, so that we copy all entries, starting from the first, this time. */

                                n = acl_entries(acl);
                                if (n < 0)
                                        return -errno;

                                copy = acl_init(n);
                                if (!copy)
                                        return -errno;

                                /* Seek back to the beginning */
                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
                                if (r < 0)
                                        return -errno;

                                continue;
                        }
                }

                if (copy) {
                        acl_entry_t new_entry;

                        if (acl_create_entry(&copy, &new_entry) < 0)
                                return -errno;

                        if (acl_copy_entry(new_entry, i) < 0)
                                return -errno;

                        if (modify)
                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
                                        return -errno;
                }

                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
                if (r < 0)
                        return -errno;
        }

        /* Pass ownership of the copy (if any) to the caller */
        *ret = copy;
        copy = NULL;

        return !!*ret;
}
/* Shifts the UIDs/GIDs referenced in the ACLs of an inode: the access ACL, and for directories also the
 * default ACL. The inode is either fd itself (name == NULL) or the entry "name" below the directory fd.
 *
 * Symlinks are skipped, and so are file systems where reading the access ACL fails with EOPNOTSUPP.
 *
 * Returns > 0 if an ACL was rewritten, 0 if nothing had to be done, negative errno value on failure. */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        /* ACLs are not supported on symlinks, there's no point in trying */
        if (S_ISLNK(st->st_mode))
                return 0;

        r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
        if (r == -EOPNOTSUPP)
                return 0;
        if (r < 0)
                return r;

        r = shift_acl(acl, shift, &shifted);
        if (r < 0)
                return r;
        if (r > 0) {
                r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
                if (r < 0)
                        return r;

                changed = true;
        }

        if (S_ISDIR(st->st_mode)) {
                /* Release the access ACL objects before reusing the variables for the default ACL. "shifted"
                 * is NULL if the access ACL needed no change, and acl_free() is not documented to accept
                 * NULL, hence check explicitly. */
                acl_free(acl);

                if (shifted)
                        acl_free(shifted);

                acl = shifted = NULL;

                r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
                if (r < 0)
                        return r;

                r = shift_acl(acl, shift, &shifted);
                if (r < 0)
                        return r;
                if (r > 0) {
                        r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
                        if (r < 0)
                                return r;

                        changed = true;
                }
        }

        return changed;
}
#else
/* Variant used when HAVE_ACL is not defined: there are no ACLs to adjust, hence always report "nothing
 * changed". */
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
return 0;
}
#endif
/* Adjusts ownership and ACLs of a single inode so that the upper 16 bits of its UID/GID match "shift", while
 * the lower 16 bits are retained. The inode is either fd itself (name == NULL) or the entry "name" below the
 * directory fd (symlinks are not followed).
 *
 * st must carry the current stat data of the inode; it is used both to detect whether a chown is needed and
 * to restore the access mode afterwards.
 *
 * Returns > 0 if ownership or ACLs were changed, 0 if everything was already fine, negative errno value on
 * failure (-EINVAL if the resulting UID/GID would be invalid). */
static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
        uid_t new_uid;
        gid_t new_gid;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        new_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
        new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));

        if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
                return -EINVAL;

        if (st->st_uid != new_uid || st->st_gid != new_gid) {
                if (name)
                        r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
                else
                        r = fchown(fd, new_uid, new_gid);
                if (r < 0)
                        return -errno;

                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
                if (name) {
                        if (!S_ISLNK(st->st_mode))
                                r = fchmodat(fd, name, st->st_mode, 0);
                        else
                                /* The mode of a symlink cannot be changed (fchmodat() does not implement
                                 * AT_SYMLINK_NOFOLLOW). In particular we must not fall back to fchmod() here,
                                 * as fd refers to the parent directory and we'd apply the symlink's mode
                                 * to it. */
                                r = 0;
                } else
                        r = fchmod(fd, st->st_mode);
                if (r < 0)
                        return -errno;

                changed = true;
        }

        r = patch_acls(fd, name, st, shift);
        if (r < 0)
                return r;

        return r > 0 || changed;
}
/* Checks whether fd lives on one of a fixed list of file system types (procfs, sysfs, cgroupfs, debugfs,
 * devpts, …), as identified by the magic reported by fstatfs().
 *
 * Returns 1 if the file system type is on the list, 0 if not, negative errno value if fstatfs() fails. */
static int is_procfs_sysfs_or_suchlike(int fd) {
        static const unsigned long magics[] = {
                BINFMTFS_MAGIC,
                CGROUP_SUPER_MAGIC,
                CGROUP2_SUPER_MAGIC,
                DEBUGFS_MAGIC,
                DEVPTS_SUPER_MAGIC,
                EFIVARFS_MAGIC,
                HUGETLBFS_MAGIC,
                MQUEUE_MAGIC,
                PROC_SUPER_MAGIC,
                PSTOREFS_MAGIC,
                SELINUX_MAGIC,
                SMACK_MAGIC,
                SYSFS_MAGIC,
        };
        struct statfs sfs;
        size_t k;

        assert(fd >= 0);

        if (fstatfs(fd, &sfs) < 0)
                return -errno;

        for (k = 0; k < sizeof(magics) / sizeof(magics[0]); k++)
                if (F_TYPE_EQUAL(sfs.f_type, (magics[k])))
                        return 1;

        return 0;
}
/* Patches ownership/ACLs of the inode fd refers to and, if it is a directory, of everything below it.
 *
 * fd          — open file descriptor of the inode to process
 * donate_fd   — if true, fd is consumed (closed before returning); if false the caller's fd is left open and
 *               a duplicate is used for iterating the directory
 * st          — stat data of the inode fd refers to
 * shift       — UID/GID shift, passed on to patch_fd()
 * is_toplevel — if true, a read-only file system (EROFS) is returned as an error; otherwise the subtree is
 *               skipped with a debug log message
 *
 * Returns > 0 if anything was changed, 0 if nothing was changed (or the inode was skipped), negative errno
 * value on failure. */
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
        bool changed = false;
        int r;

        assert(fd >= 0);

        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we
         * probably shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's
         * stop the recursion when we hit a procfs or sysfs file system. */
        r = is_procfs_sysfs_or_suchlike(fd);
        if (r < 0)
                goto finish;
        if (r > 0) {
                r = 0; /* don't recurse */
                goto finish;
        }

        r = patch_fd(fd, NULL, st, shift);
        if (r == -EROFS) {
                _cleanup_free_ char *name = NULL;

                if (!is_toplevel) {
                        /* When we hit a read-only subtree we simply skip it, but log about it. */
                        (void) fd_get_path(fd, &name);
                        log_debug("Skippping read-only file or directory %s.", strna(name));
                        r = 0;
                }

                goto finish;
        }
        if (r < 0)
                goto finish;
        if (r > 0)
                /* Remember that the inode itself was modified, so that this is reflected in our return value
                 * even if nothing below it needs changing. */
                changed = true;

        if (S_ISDIR(st->st_mode)) {
                _cleanup_closedir_ DIR *d = NULL;
                struct dirent *de;

                if (!donate_fd) {
                        int copy;

                        /* fdopendir() takes possession of the fd, hence work on a duplicate if the caller
                         * wants to keep theirs. */
                        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (copy < 0) {
                                r = -errno;
                                goto finish;
                        }

                        fd = copy;
                        donate_fd = true;
                }

                d = fdopendir(fd);
                if (!d) {
                        r = -errno;
                        goto finish;
                }
                fd = -1; /* now owned by d */

                FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
                        struct stat fst;

                        if (STR_IN_SET(de->d_name, ".", ".."))
                                continue;

                        if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
                                r = -errno;
                                goto finish;
                        }

                        if (S_ISDIR(fst.st_mode)) {
                                int subdir_fd;

                                subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
                                if (subdir_fd < 0) {
                                        r = -errno;
                                        goto finish;
                                }

                                /* subdir_fd is donated, i.e. closed by the callee */
                                r = recurse_fd(subdir_fd, true, &fst, shift, false);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;

                        } else {
                                r = patch_fd(dirfd(d), de->d_name, &fst, shift);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;
                        }
                }
        }

        r = changed;

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}
/* Validates the parameters and the top-level inode, then recursively patches the tree below fd.
 *
 * fd        — open file descriptor of the root of the tree
 * donate_fd — if true, fd is closed before returning, on every path
 * shift     — first host UID/GID of the container; must be a multiple of 0x10000
 * range     — number of UIDs/GIDs assigned; must be exactly 0x10000
 *
 * Returns > 0 if something was changed, 0 if the tree was (assumed to be) already fine, -EOPNOTSUPP for
 * unsupported shift/range values, -EBADE if UID and GID of the root inode belong to different 64K ranges, or
 * another negative errno value on failure. */
static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
        struct stat st;
        int r;

        assert(fd >= 0);

        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
         * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
         * UID within the container. */

        if ((shift & 0xFFFF) != 0) {
                /* We only support containers where the shift starts at a 2^16 boundary */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (range != 0x10000) {
                /* We only support containers with 16bit UID ranges for the patching logic */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (fstat(fd, &st) < 0) {
                r = -errno;
                goto finish;
        }

        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
                /* We only support containers where the uid/gid container ID match */
                r = -EBADE;
                goto finish;
        }

        /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we
         * assume that if the top-level dir has the right upper 16bit assigned, then everything below will have
         * too... */
        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
                /* Leave via the common exit path, so that a donated fd is closed rather than leaked */
                r = 0;
                goto finish;
        }

        /* recurse_fd() takes care of the fd itself, according to donate_fd */
        return recurse_fd(fd, donate_fd, &st, shift, true);

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}
/* Patches the tree below an already open directory fd. The fd is not donated, i.e. it remains open and owned
 * by the caller. The result of fd_patch_uid_internal() is returned as is. */
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        const bool donate_fd = false;

        return fd_patch_uid_internal(fd, donate_fd, shift, range);
}
/* Opens the directory at "path" (without following a final symlink) and patches the tree below it. The fd
 * opened here is donated to fd_patch_uid_internal(). Returns a negative errno value if the directory cannot
 * be opened, otherwise the result of fd_patch_uid_internal(). */
int path_patch_uid(const char *path, uid_t shift, uid_t range) {
        const int open_flags = O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME;
        int root_fd;

        root_fd = open(path, open_flags);
        if (root_fd < 0)
                return -errno;

        return fd_patch_uid_internal(root_fd, true, shift, range);
}

View File

@ -0,0 +1,23 @@
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <sys/types.h>
int fd_patch_uid(int fd, uid_t shift, uid_t range);
int path_patch_uid(const char *path, uid_t shift, uid_t range);

View File

@ -25,7 +25,9 @@
#include "parse-util.h" #include "parse-util.h"
#include "process-util.h" #include "process-util.h"
#include "strv.h" #include "strv.h"
#include "user-util.h"
#include "util.h" #include "util.h"
#include "string-util.h"
int settings_load(FILE *f, const char *path, Settings **ret) { int settings_load(FILE *f, const char *path, Settings **ret) {
_cleanup_(settings_freep) Settings *s = NULL; _cleanup_(settings_freep) Settings *s = NULL;
@ -40,9 +42,13 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
s->start_mode = _START_MODE_INVALID; s->start_mode = _START_MODE_INVALID;
s->personality = PERSONALITY_INVALID; s->personality = PERSONALITY_INVALID;
s->userns_mode = _USER_NAMESPACE_MODE_INVALID;
s->uid_shift = UID_INVALID;
s->uid_range = UID_INVALID;
s->read_only = -1; s->read_only = -1;
s->volatile_mode = _VOLATILE_MODE_INVALID; s->volatile_mode = _VOLATILE_MODE_INVALID;
s->userns_chown = -1;
s->private_network = -1; s->private_network = -1;
s->network_veth = -1; s->network_veth = -1;
@ -59,6 +65,16 @@ int settings_load(FILE *f, const char *path, Settings **ret) {
if (r < 0) if (r < 0)
return r; return r;
/* Make sure that if userns_mode is set, userns_chown is set to something appropriate, and vice versa. Either
* both fields shall be initialized or neither. */
if (s->userns_mode == USER_NAMESPACE_PICK)
s->userns_chown = true;
else if (s->userns_mode != _USER_NAMESPACE_MODE_INVALID && s->userns_chown < 0)
s->userns_chown = false;
if (s->userns_chown >= 0 && s->userns_mode == _USER_NAMESPACE_MODE_INVALID)
s->userns_mode = USER_NAMESPACE_NO;
*ret = s; *ret = s;
s = NULL; s = NULL;
@ -392,3 +408,73 @@ conflict:
log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring."); log_syntax(unit, LOG_ERR, filename, line, r, "Conflicting Boot= or ProcessTwo= setting found. Ignoring.");
return 0; return 0;
} }
/* Config file parser callback for the PrivateUsers= setting. "data" is treated as the Settings object to
 * update. Accepted values:
 *
 *   boolean false          — user namespacing off
 *   boolean true           — user namespacing on, UID shift left unset (UID_INVALID), range 0x10000
 *   "pick"                 — user namespacing on in "pick" mode, UID shift left unset, range 0x10000
 *   "<shift>[:<range>]"    — user namespacing on with explicit shift; range defaults to 0x10000
 *
 * Invalid values are logged and ignored, leaving the settings untouched. Always returns 0. */
int config_parse_private_users(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        Settings *settings = data;
        int r;

        assert(filename);
        assert(lvalue);
        assert(rvalue);

        r = parse_boolean(rvalue);
        if (r == 0) {
                /* no: User namespacing off */
                settings->userns_mode = USER_NAMESPACE_NO;
                settings->uid_shift = UID_INVALID;
                settings->uid_range = UINT32_C(0x10000);
        } else if (r > 0) {
                /* yes: User namespacing on, UID range is read from root dir */
                settings->userns_mode = USER_NAMESPACE_FIXED;
                settings->uid_shift = UID_INVALID;
                settings->uid_range = UINT32_C(0x10000);
        } else if (streq(rvalue, "pick")) {
                /* pick: User namespacing on, UID range is picked randomly */
                settings->userns_mode = USER_NAMESPACE_PICK;
                settings->uid_shift = UID_INVALID;
                settings->uid_range = UINT32_C(0x10000);
        } else {
                const char *range, *shift;
                uid_t sh, rn;

                /* anything else: User namespacing on, UID range is explicitly configured */

                range = strchr(rvalue, ':');
                if (range) {
                        shift = strndupa(rvalue, range - rvalue);
                        range++;

                        r = safe_atou32(range, &rn);
                        if (r < 0 || rn <= 0) {
                                log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID range invalid, ignoring: %s", range);
                                return 0;
                        }
                } else {
                        shift = rvalue;
                        rn = UINT32_C(0x10000);
                }

                r = parse_uid(shift, &sh);
                if (r < 0) {
                        /* Report the string we actually failed to parse. Note that "range" is NULL here if
                         * no colon was specified, hence must not be used for the message. */
                        log_syntax(unit, LOG_ERR, filename, line, r, "UID/GID shift invalid, ignoring: %s", shift);
                        return 0;
                }

                settings->userns_mode = USER_NAMESPACE_FIXED;
                settings->uid_shift = sh;
                settings->uid_range = rn;
        }

        return 0;
}

View File

@ -33,6 +33,14 @@ typedef enum StartMode {
_START_MODE_INVALID = -1 _START_MODE_INVALID = -1
} StartMode; } StartMode;
/* How user namespacing is configured for a container. The _MAX and _INVALID members are markers (count of
 * regular values, and "not set" respectively), not actual modes. */
typedef enum UserNamespaceMode {
        _USER_NAMESPACE_MODE_INVALID = -1,
        USER_NAMESPACE_NO            = 0,
        USER_NAMESPACE_FIXED         = 1,
        USER_NAMESPACE_PICK          = 2,
        _USER_NAMESPACE_MODE_MAX     = 3,
} UserNamespaceMode;
typedef enum SettingsMask { typedef enum SettingsMask {
SETTING_START_MODE = 1 << 0, SETTING_START_MODE = 1 << 0,
SETTING_ENVIRONMENT = 1 << 1, SETTING_ENVIRONMENT = 1 << 1,
@ -47,7 +55,8 @@ typedef enum SettingsMask {
SETTING_VOLATILE_MODE = 1 << 10, SETTING_VOLATILE_MODE = 1 << 10,
SETTING_CUSTOM_MOUNTS = 1 << 11, SETTING_CUSTOM_MOUNTS = 1 << 11,
SETTING_WORKING_DIRECTORY = 1 << 12, SETTING_WORKING_DIRECTORY = 1 << 12,
_SETTINGS_MASK_ALL = (1 << 13) -1 SETTING_USERNS = 1 << 13,
_SETTINGS_MASK_ALL = (1 << 14) -1
} SettingsMask; } SettingsMask;
typedef struct Settings { typedef struct Settings {
@ -62,12 +71,15 @@ typedef struct Settings {
unsigned long personality; unsigned long personality;
sd_id128_t machine_id; sd_id128_t machine_id;
char *working_directory; char *working_directory;
UserNamespaceMode userns_mode;
uid_t uid_shift, uid_range;
/* [Image] */ /* [Image] */
int read_only; int read_only;
VolatileMode volatile_mode; VolatileMode volatile_mode;
CustomMount *custom_mounts; CustomMount *custom_mounts;
unsigned n_custom_mounts; unsigned n_custom_mounts;
int userns_chown;
/* [Network] */ /* [Network] */
int private_network; int private_network;
@ -99,3 +111,4 @@ int config_parse_tmpfs(const char *unit, const char *filename, unsigned line, co
int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_veth_extra(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata); int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);

View File

@ -22,7 +22,9 @@
#endif #endif
#include <errno.h> #include <errno.h>
#include <getopt.h> #include <getopt.h>
#include <grp.h>
#include <linux/loop.h> #include <linux/loop.h>
#include <pwd.h>
#include <sched.h> #include <sched.h>
#ifdef HAVE_SECCOMP #ifdef HAVE_SECCOMP
#include <seccomp.h> #include <seccomp.h>
@ -75,6 +77,7 @@
#include "nspawn-expose-ports.h" #include "nspawn-expose-ports.h"
#include "nspawn-mount.h" #include "nspawn-mount.h"
#include "nspawn-network.h" #include "nspawn-network.h"
#include "nspawn-patch-uid.h"
#include "nspawn-register.h" #include "nspawn-register.h"
#include "nspawn-settings.h" #include "nspawn-settings.h"
#include "nspawn-setuid.h" #include "nspawn-setuid.h"
@ -101,6 +104,11 @@
#include "user-util.h" #include "user-util.h"
#include "util.h" #include "util.h"
/* Note that devpts's gid= parameter parses GIDs as signed values, hence we stay away from the upper half of the 32bit
* UID range here */
#define UID_SHIFT_PICK_MIN ((uid_t) UINT32_C(0x00080000))
#define UID_SHIFT_PICK_MAX ((uid_t) UINT32_C(0x6FFF0000))
typedef enum ContainerStatus { typedef enum ContainerStatus {
CONTAINER_TERMINATED, CONTAINER_TERMINATED,
CONTAINER_REBOOTED CONTAINER_REBOOTED
@ -173,8 +181,9 @@ static char *arg_image = NULL;
static VolatileMode arg_volatile_mode = VOLATILE_NO; static VolatileMode arg_volatile_mode = VOLATILE_NO;
static ExposePort *arg_expose_ports = NULL; static ExposePort *arg_expose_ports = NULL;
static char **arg_property = NULL; static char **arg_property = NULL;
static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U; static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false; static bool arg_userns_chown = false;
static int arg_kill_signal = 0; static int arg_kill_signal = 0;
static bool arg_unified_cgroup_hierarchy = false; static bool arg_unified_cgroup_hierarchy = false;
static SettingsMask arg_settings_mask = 0; static SettingsMask arg_settings_mask = 0;
@ -202,8 +211,10 @@ static void help(void) {
" --uuid=UUID Set a specific machine UUID for the container\n" " --uuid=UUID Set a specific machine UUID for the container\n"
" -S --slice=SLICE Place the container in the specified slice\n" " -S --slice=SLICE Place the container in the specified slice\n"
" --property=NAME=VALUE Set scope unit property\n" " --property=NAME=VALUE Set scope unit property\n"
" -U --private-users=pick Run within user namespace, pick UID/GID range automatically\n"
" --private-users[=UIDBASE[:NUIDS]]\n" " --private-users[=UIDBASE[:NUIDS]]\n"
" Run within user namespace\n" " Run within user namespace, user configured UID/GID range\n"
" --private-user-chown Adjust OS tree file ownership for private UID/GID range\n"
" --private-network Disable network in container\n" " --private-network Disable network in container\n"
" --network-interface=INTERFACE\n" " --network-interface=INTERFACE\n"
" Assign an existing network interface to the\n" " Assign an existing network interface to the\n"
@ -272,10 +283,16 @@ static int custom_mounts_prepare(void) {
for (i = 0; i < arg_n_custom_mounts; i++) { for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i]; CustomMount *m = &arg_custom_mounts[i];
if (arg_userns && arg_uid_shift == UID_INVALID && path_equal(m->destination, "/")) { if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
if (arg_userns_chown) {
log_error("--private-users-chown may not be combined with custom root mounts.");
return -EINVAL;
} else if (arg_uid_shift == UID_INVALID) {
log_error("--private-users with automatic UID shift may not be combined with custom root mounts."); log_error("--private-users with automatic UID shift may not be combined with custom root mounts.");
return -EINVAL; return -EINVAL;
} }
}
if (m->type != CUSTOM_MOUNT_OVERLAY) if (m->type != CUSTOM_MOUNT_OVERLAY)
continue; continue;
@ -349,6 +366,7 @@ static int parse_argv(int argc, char *argv[]) {
ARG_KILL_SIGNAL, ARG_KILL_SIGNAL,
ARG_SETTINGS, ARG_SETTINGS,
ARG_CHDIR, ARG_CHDIR,
ARG_PRIVATE_USERS_CHOWN,
}; };
static const struct option options[] = { static const struct option options[] = {
@ -392,6 +410,7 @@ static int parse_argv(int argc, char *argv[]) {
{ "port", required_argument, NULL, 'p' }, { "port", required_argument, NULL, 'p' },
{ "property", required_argument, NULL, ARG_PROPERTY }, { "property", required_argument, NULL, ARG_PROPERTY },
{ "private-users", optional_argument, NULL, ARG_PRIVATE_USERS }, { "private-users", optional_argument, NULL, ARG_PRIVATE_USERS },
{ "private-users-chown", optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN},
{ "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL }, { "kill-signal", required_argument, NULL, ARG_KILL_SIGNAL },
{ "settings", required_argument, NULL, ARG_SETTINGS }, { "settings", required_argument, NULL, ARG_SETTINGS },
{ "chdir", required_argument, NULL, ARG_CHDIR }, { "chdir", required_argument, NULL, ARG_CHDIR },
@ -406,7 +425,7 @@ static int parse_argv(int argc, char *argv[]) {
assert(argc >= 0); assert(argc >= 0);
assert(argv); assert(argv);
while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:n", options, NULL)) >= 0) while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nU", options, NULL)) >= 0)
switch (c) { switch (c) {
@ -797,10 +816,29 @@ static int parse_argv(int argc, char *argv[]) {
break; break;
case ARG_PRIVATE_USERS: case ARG_PRIVATE_USERS:
if (optarg) {
r = optarg ? parse_boolean(optarg) : 1;
if (r == 0) {
/* no: User namespacing off */
arg_userns_mode = USER_NAMESPACE_NO;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
} else if (r > 0) {
/* yes: User namespacing on, UID range is read from root dir */
arg_userns_mode = USER_NAMESPACE_FIXED;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
} else if (streq(optarg, "pick")) {
/* pick: User namespacing on, UID range is picked randomly */
arg_userns_mode = USER_NAMESPACE_PICK;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
} else {
_cleanup_free_ char *buffer = NULL; _cleanup_free_ char *buffer = NULL;
const char *range, *shift; const char *range, *shift;
/* anything else: User namespacing on, UID range is explicitly configured */
range = strchr(optarg, ':'); range = strchr(optarg, ':');
if (range) { if (range) {
buffer = strndup(optarg, range - optarg); buffer = strndup(optarg, range - optarg);
@ -820,9 +858,28 @@ static int parse_argv(int argc, char *argv[]) {
log_error("Failed to parse UID: %s", optarg); log_error("Failed to parse UID: %s", optarg);
return -EINVAL; return -EINVAL;
} }
arg_userns_mode = USER_NAMESPACE_FIXED;
} }
arg_userns = true; arg_settings_mask |= SETTING_USERNS;
break;
case 'U':
if (userns_supported()) {
arg_userns_mode = USER_NAMESPACE_PICK;
arg_uid_shift = UID_INVALID;
arg_uid_range = UINT32_C(0x10000);
arg_settings_mask |= SETTING_USERNS;
}
break;
case ARG_PRIVATE_USERS_CHOWN:
arg_userns_chown = true;
arg_settings_mask |= SETTING_USERNS;
break; break;
case ARG_KILL_SIGNAL: case ARG_KILL_SIGNAL:
@ -893,6 +950,9 @@ static int parse_argv(int argc, char *argv[]) {
if (arg_share_system) if (arg_share_system)
arg_register = false; arg_register = false;
if (arg_userns_mode == USER_NAMESPACE_PICK)
arg_userns_chown = true;
if (arg_start_mode != START_PID1 && arg_share_system) { if (arg_start_mode != START_PID1 && arg_share_system) {
log_error("--boot and --share-system may not be combined."); log_error("--boot and --share-system may not be combined.");
return -EINVAL; return -EINVAL;
@ -933,8 +993,15 @@ static int parse_argv(int argc, char *argv[]) {
return -EINVAL; return -EINVAL;
} }
if (arg_userns && access("/proc/self/uid_map", F_OK) < 0) if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported()) {
return log_error_errno(EOPNOTSUPP, "--private-users= is not supported, kernel compiled without user namespace support."); log_error("--private-users= is not supported, kernel compiled without user namespace support.");
return -EOPNOTSUPP;
}
if (arg_userns_chown && arg_read_only) {
log_error("--read-only and --private-users-chown may not be combined.");
return -EINVAL;
}
if (argc > optind) { if (argc > optind) {
arg_parameters = strv_copy(argv + optind); arg_parameters = strv_copy(argv + optind);
@ -993,7 +1060,7 @@ static int verify_arguments(void) {
static int userns_lchown(const char *p, uid_t uid, gid_t gid) { static int userns_lchown(const char *p, uid_t uid, gid_t gid) {
assert(p); assert(p);
if (!arg_userns) if (arg_userns_mode == USER_NAMESPACE_NO)
return 0; return 0;
if (uid == UID_INVALID && gid == GID_INVALID) if (uid == UID_INVALID && gid == GID_INVALID)
@ -2218,6 +2285,29 @@ static int setup_machine_id(const char *directory) {
return 0; return 0;
} }
/* Adjusts the file ownership of the OS tree below 'directory' so that it matches the UID/GID range that
 * starts at 'shift' and covers 'range' IDs, by calling path_patch_uid().
 *
 * This is a NOP (returning 0) unless user namespacing is enabled and ownership patching was requested
 * via arg_userns_chown.
 *
 * Returns a negative errno-style error (already logged) on failure, 0 if the tree was already owned by
 * the right range and nothing had to be done, and otherwise the positive value path_patch_uid()
 * returned after patching the tree. */
static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
        int r;

        assert(directory);

        if (arg_userns_mode == USER_NAMESPACE_NO || !arg_userns_chown)
                return 0;

        /* Use the range the caller handed us, instead of silently ignoring the parameters and reading the
         * global arg_uid_shift/arg_uid_range variables again. */
        r = path_patch_uid(directory, shift, range);
        if (r == -EOPNOTSUPP)
                return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
        if (r == -EBADE)
                return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
        if (r < 0)
                return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
        if (r == 0)
                log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
        else
                log_debug("Patched directory tree to match UID/GID range.");

        return r;
}
static int mount_devices( static int mount_devices(
const char *where, const char *where,
const char *root_device, bool root_device_rw, const char *root_device, bool root_device_rw,
@ -2435,7 +2525,7 @@ static int determine_names(void) {
static int determine_uid_shift(const char *directory) { static int determine_uid_shift(const char *directory) {
int r; int r;
if (!arg_userns) { if (arg_userns_mode == USER_NAMESPACE_NO) {
arg_uid_shift = 0; arg_uid_shift = 0;
return 0; return 0;
} }
@ -2462,7 +2552,6 @@ static int determine_uid_shift(const char *directory) {
return -EINVAL; return -EINVAL;
} }
log_info("Using user namespaces with base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
return 0; return 0;
} }
@ -2499,7 +2588,7 @@ static int inner_child(
cg_unified_flush(); cg_unified_flush();
if (arg_userns) { if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Tell the parent, that it now can write the UID map. */ /* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */ (void) barrier_place(barrier); /* #1 */
@ -2510,7 +2599,14 @@ static int inner_child(
} }
} }
r = mount_all(NULL, arg_userns, true, arg_uid_shift, arg_private_network, arg_uid_range, arg_selinux_apifs_context); r = mount_all(NULL,
arg_userns_mode != USER_NAMESPACE_NO,
true,
arg_private_network,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);
if (r < 0) if (r < 0)
return r; return r;
@ -2749,7 +2845,8 @@ static int outer_child(
if (r < 0) if (r < 0)
return r; return r;
if (arg_userns) { if (arg_userns_mode != USER_NAMESPACE_NO) {
/* Let the parent know which UID shift we read from the image */
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL); l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
if (l < 0) if (l < 0)
return log_error_errno(errno, "Failed to send UID shift: %m"); return log_error_errno(errno, "Failed to send UID shift: %m");
@ -2757,17 +2854,49 @@ static int outer_child(
log_error("Short write while sending UID shift."); log_error("Short write while sending UID shift.");
return -EIO; return -EIO;
} }
if (arg_userns_mode == USER_NAMESPACE_PICK) {
/* When we are supposed to pick the UID shift, the parent will check now whether the UID shift
* we just read from the image is available. If yes, it will send the UID shift back to us, if
* not it will pick a different one, and send it back to us. */
l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
if (l < 0)
return log_error_errno(errno, "Failed to recv UID shift: %m");
if (l != sizeof(arg_uid_shift)) {
log_error("Short read while recieving UID shift.");
return -EIO;
}
}
log_info("Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
} }
/* Turn directory into bind mount */ /* Turn directory into bind mount */
if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0) if (mount(directory, directory, NULL, MS_BIND|MS_REC, NULL) < 0)
return log_error_errno(errno, "Failed to make bind mount: %m"); return log_error_errno(errno, "Failed to make bind mount: %m");
r = setup_volatile(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
if (r < 0) if (r < 0)
return r; return r;
r = setup_volatile_state(directory, arg_volatile_mode, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_context); r = setup_volatile(
directory,
arg_volatile_mode,
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
arg_selinux_context);
if (r < 0)
return r;
r = setup_volatile_state(
directory,
arg_volatile_mode,
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
arg_selinux_context);
if (r < 0) if (r < 0)
return r; return r;
@ -2781,7 +2910,13 @@ static int outer_child(
return log_error_errno(r, "Failed to make tree read-only: %m"); return log_error_errno(r, "Failed to make tree read-only: %m");
} }
r = mount_all(directory, arg_userns, false, arg_private_network, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); r = mount_all(directory,
arg_userns_mode != USER_NAMESPACE_NO,
false,
arg_private_network,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);
if (r < 0) if (r < 0)
return r; return r;
@ -2823,11 +2958,24 @@ static int outer_child(
if (r < 0) if (r < 0)
return r; return r;
r = mount_custom(directory, arg_custom_mounts, arg_n_custom_mounts, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); r = mount_custom(
directory,
arg_custom_mounts,
arg_n_custom_mounts,
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);
if (r < 0) if (r < 0)
return r; return r;
r = mount_cgroups(directory, arg_unified_cgroup_hierarchy, arg_userns, arg_uid_shift, arg_uid_range, arg_selinux_apifs_context); r = mount_cgroups(
directory,
arg_unified_cgroup_hierarchy,
arg_userns_mode != USER_NAMESPACE_NO,
arg_uid_shift,
arg_uid_range,
arg_selinux_apifs_context);
if (r < 0) if (r < 0)
return r; return r;
@ -2838,7 +2986,7 @@ static int outer_child(
pid = raw_clone(SIGCHLD|CLONE_NEWNS| pid = raw_clone(SIGCHLD|CLONE_NEWNS|
(arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) | (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS) |
(arg_private_network ? CLONE_NEWNET : 0) | (arg_private_network ? CLONE_NEWNET : 0) |
(arg_userns ? CLONE_NEWUSER : 0), (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0),
NULL); NULL);
if (pid < 0) if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m"); return log_error_errno(errno, "Failed to fork inner child: %m");
@ -2882,6 +3030,61 @@ static int outer_child(
return 0; return 0;
} }
/* Picks a UID/GID base for the container and takes a lock file on it.
 *
 * On entry *shift is the preferred base (the caller passes what was read from the image). It is used if it
 * passes all checks below, otherwise random candidates are tried. On success the chosen base is written
 * back to *shift, ownership of the lock file is handed to *ret_lock_file, and 0 is returned. Returns
 * -EBUSY if no usable base was found before the attempt counter ran out, or the error of
 * make_lock_file() if locking failed for any reason other than -EBUSY. */
static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
        unsigned attempts = 100;
        uid_t base;
        int r;

        assert(shift);
        assert(ret_lock_file);
        assert(arg_userns_mode == USER_NAMESPACE_PICK);
        assert(arg_uid_range == 0x10000U);

        /* The first candidate is whatever the caller suggested */
        base = *shift;

        /* Best effort: the directory the lock files are placed in */
        (void) mkdir("/run/systemd/nspawn-uid", 0755);

        for (;;) {
                char lock_path[strlen("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
                _cleanup_release_lock_file_ LockFile lock = LOCK_FILE_INIT;

                attempts--;
                if (attempts == 0)
                        return -EBUSY;

                /* Only consider candidates inside the pick window whose lower 16 bits are zero */
                if (base >= UID_SHIFT_PICK_MIN &&
                    base <= UID_SHIFT_PICK_MAX &&
                    (base & UINT32_C(0xFFFF)) == 0) {

                        xsprintf(lock_path, "/run/systemd/nspawn-uid/" UID_FMT, base);
                        r = make_lock_file(lock_path, LOCK_EX|LOCK_NB, &lock);
                        if (r < 0 && r != -EBUSY)
                                return r;

                        /* -EBUSY means the range is already taken by another nspawn instance, in which
                         * case we move on to the next candidate. Otherwise make some superficial checks
                         * whether the range is currently known in the user database. */
                        if (r >= 0 &&
                            !getpwuid(base) &&
                            !getpwuid(base + UINT32_C(0xFFFE)) &&
                            !getgrgid(base) &&
                            !getgrgid(base + UINT32_C(0xFFFE))) {

                                /* Pass the lock to the caller, and reset our copy so that the cleanup
                                 * handler does not release it again. */
                                *ret_lock_file = lock;
                                lock = (struct LockFile) LOCK_FILE_INIT;

                                *shift = base;
                                return 0;
                        }
                }

                /* This candidate is unusable: roll a new one inside the pick window and clear its
                 * lower 16 bits. */
                random_bytes(&base, sizeof(base));
                base = (base % (UID_SHIFT_PICK_MAX - UID_SHIFT_PICK_MIN)) + UID_SHIFT_PICK_MIN;
                base &= (uid_t) UINT32_C(0xFFFF0000);
        }
}
static int setup_uid_map(pid_t pid) { static int setup_uid_map(pid_t pid) {
char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1]; char uid_map[strlen("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1], line[DECIMAL_STR_MAX(uid_t)*3+3+1];
int r; int r;
@ -3113,6 +3316,19 @@ static int load_settings(void) {
} }
} }
if ((arg_settings_mask & SETTING_USERNS) == 0 &&
settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {
if (!arg_settings_trusted)
log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", p);
else {
arg_userns_mode = settings->userns_mode;
arg_uid_shift = settings->uid_shift;
arg_uid_range = settings->uid_range;
arg_userns_chown = settings->userns_chown;
}
}
return 0; return 0;
} }
@ -3351,20 +3567,42 @@ int main(int argc, char *argv[]) {
} }
for (;;) { for (;;) {
_cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 }, uuid_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 };
ContainerStatus container_status;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
static const struct sigaction sa = { static const struct sigaction sa = {
.sa_handler = nop_signal_handler, .sa_handler = nop_signal_handler,
.sa_flags = SA_NOCLDSTOP, .sa_flags = SA_NOCLDSTOP,
}; };
int ifi = 0;
ssize_t l; _cleanup_release_lock_file_ LockFile uid_shift_lock = LOCK_FILE_INIT;
_cleanup_close_ int etc_passwd_lock = -1;
_cleanup_close_pair_ int
kmsg_socket_pair[2] = { -1, -1 },
rtnl_socket_pair[2] = { -1, -1 },
pid_socket_pair[2] = { -1, -1 },
uuid_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 };
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
_cleanup_(sd_event_unrefp) sd_event *event = NULL; _cleanup_(sd_event_unrefp) sd_event *event = NULL;
_cleanup_(pty_forward_freep) PTYForward *forward = NULL; _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
_cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL; _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
ContainerStatus container_status;
char last_char = 0; char last_char = 0;
int ifi = 0;
ssize_t l;
if (arg_userns_mode == USER_NAMESPACE_PICK) {
        /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
         * check with getpwuid() if the specific user already exists. Note that /etc might be
         * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
         * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
         * really just an extra safety net. We kinda assume that the UID range we allocate from is
         * really ours. */

        etc_passwd_lock = take_etc_passwd_lock(NULL);
        if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS) {
                /* The error code is carried in etc_passwd_lock itself. Log that one (rather than the
                 * unrelated, stale value of r), and propagate it via r so that the exit path sees
                 * the failure. */
                r = log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
                goto finish;
        }
}
r = barrier_create(&barrier); r = barrier_create(&barrier);
if (r < 0) { if (r < 0) {
@ -3392,7 +3630,7 @@ int main(int argc, char *argv[]) {
goto finish; goto finish;
} }
if (arg_userns) if (arg_userns_mode != USER_NAMESPACE_NO)
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) { if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0) {
r = log_error_errno(errno, "Failed to create uid shift socket pair: %m"); r = log_error_errno(errno, "Failed to create uid shift socket pair: %m");
goto finish; goto finish;
@ -3468,6 +3706,43 @@ int main(int argc, char *argv[]) {
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]); uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
        /* The child just let us know the UID shift it might have read from the image. */
        l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
        if (l < 0) {
                r = log_error_errno(errno, "Failed to read UID shift: %m");
                goto finish;
        }
        if (l != sizeof(arg_uid_shift)) {
                log_error("Short read while reading UID shift.");
                /* Errors are reported as negative errno values everywhere else here, a positive EIO
                 * would not be recognized as failure by "r < 0" style checks. */
                r = -EIO;
                goto finish;
        }

        if (arg_userns_mode == USER_NAMESPACE_PICK) {
                /* If we are supposed to pick the UID shift, let's try to use the shift read from the
                 * image, but if that's already in use, pick a new one, and report back to the child,
                 * which one we now picked. */

                r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
                if (r < 0) {
                        log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
                        goto finish;
                }

                l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
                if (l < 0) {
                        r = log_error_errno(errno, "Failed to send UID shift: %m");
                        goto finish;
                }
                if (l != sizeof(arg_uid_shift)) {
                        log_error("Short write while writing UID shift.");
                        r = -EIO;
                        goto finish;
                }
        }
}
/* Wait for the outer child. */ /* Wait for the outer child. */
r = wait_for_terminate_and_warn("namespace helper", pid, NULL); r = wait_for_terminate_and_warn("namespace helper", pid, NULL);
if (r < 0) if (r < 0)
@ -3504,24 +3779,13 @@ int main(int argc, char *argv[]) {
log_debug("Init process invoked as PID " PID_FMT, pid); log_debug("Init process invoked as PID " PID_FMT, pid);
if (arg_userns) { if (arg_userns_mode != USER_NAMESPACE_NO) {
if (!barrier_place_and_sync(&barrier)) { /* #1 */ if (!barrier_place_and_sync(&barrier)) { /* #1 */
log_error("Child died too early."); log_error("Child died too early.");
r = -ESRCH; r = -ESRCH;
goto finish; goto finish;
} }
l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof(arg_uid_shift), 0);
if (l < 0) {
r = log_error_errno(errno, "Failed to read UID shift: %m");
goto finish;
}
if (l != sizeof(arg_uid_shift)) {
log_error("Short read while reading UID shift.");
r = EIO;
goto finish;
}
r = setup_uid_map(pid); r = setup_uid_map(pid);
if (r < 0) if (r < 0)
goto finish; goto finish;
@ -3619,6 +3883,10 @@ int main(int argc, char *argv[]) {
goto finish; goto finish;
} }
/* At this point we have made use of the UID we picked, and thus nss-mymachines will make them appear
* in getpwuid(), thus we can release the /etc/passwd lock. */
etc_passwd_lock = safe_close(etc_passwd_lock);
sd_notifyf(false, sd_notifyf(false,
"READY=1\n" "READY=1\n"
"STATUS=Container running.\n" "STATUS=Container running.\n"

View File

@ -0,0 +1,61 @@
/***
This file is part of systemd.
Copyright 2016 Lennart Poettering
systemd is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version.
systemd is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
#include <stdlib.h>
#include "log.h"
#include "nspawn-patch-uid.h"
#include "user-util.h"
#include "util.h"
/* Manual test tool for path_patch_uid().
 *
 * Usage: test-patch-uid PATH SHIFT RANGE
 *
 * Parses SHIFT and RANGE as UIDs, invokes path_patch_uid() on PATH with them, and logs whether the call
 * reported a change. Exits with EXIT_FAILURE on wrong usage, unparsable numbers, or if patching fails,
 * and with EXIT_SUCCESS otherwise. */
int main(int argc, char *argv[]) {
        uid_t shift, range;
        int r;

        log_set_max_level(LOG_DEBUG);
        log_parse_environment();
        log_open();

        if (argc != 4) {
                log_error("Expected PATH SHIFT RANGE parameters.");
                return EXIT_FAILURE;
        }

        r = parse_uid(argv[2], &shift);
        if (r < 0) {
                log_error_errno(r, "Failed to parse UID shift %s.", argv[2]);
                return EXIT_FAILURE;
        }

        /* 'range' is a uid_t, hence parse it with the UID parser too, instead of handing a uid_t pointer
         * to the GID parser. */
        r = parse_uid(argv[3], &range);
        if (r < 0) {
                log_error_errno(r, "Failed to parse UID range %s.", argv[3]);
                return EXIT_FAILURE;
        }

        r = path_patch_uid(argv[1], shift, range);
        if (r < 0) {
                log_error_errno(r, "Failed to patch directory tree: %m");
                return EXIT_FAILURE;
        }

        /* A positive return value is reported as "yes", zero as "no" */
        log_info("Changed: %s", yes_no(r));

        return EXIT_SUCCESS;
}

View File

@ -13,7 +13,7 @@ Before=machines.target
After=network.target After=network.target
[Service] [Service]
ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth --settings=override --machine=%i ExecStart=@bindir@/systemd-nspawn --quiet --keep-unit --boot --link-journal=try-guest --network-veth -U --settings=override --machine=%i
KillMode=mixed KillMode=mixed
Type=notify Type=notify
RestartForceExitStatus=133 RestartForceExitStatus=133