mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-08-22 01:50:10 +03:00
nspawn: mount temporary visible procfs and sysfs instance
In order to mount procfs and sysfs in an unprivileged container, the kernel requires that a fully visible instance of each is already present in the target mount namespace. Mount one here so the inner child can mount its own instances. Later, we unmount the temporary instances created here before we actually exec the payload. Since the rootfs is shared, the unmount will propagate into the container. Note that the inner child wouldn't be able to unmount the instances on its own since it doesn't own the originating mount namespace. In other words, the outer child needs to do this.

So far nspawn didn't run into this issue because it used MS_MOVE, which meant that the shadow mount tree pinned a procfs and a sysfs instance which the kernel would find. The shadow mount tree is gone with proper pivot_root() semantics.

Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
This commit is contained in:
committed by
Christian Brauner (Microsoft)
parent
57c10a5650
commit
b71a0192c0
@ -13,6 +13,7 @@
|
||||
#include "mkdir-label.h"
|
||||
#include "mount-util.h"
|
||||
#include "mountpoint-util.h"
|
||||
#include "namespace-util.h"
|
||||
#include "nspawn-mount.h"
|
||||
#include "parse-util.h"
|
||||
#include "path-util.h"
|
||||
@ -510,6 +511,9 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
|
||||
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
|
||||
}
|
||||
|
||||
/* Mount flags for procfs and (read-only) sysfs. They are referenced both by the mount table in
 * mount_all() and by pin_fully_visible_fs(), so that both places use the same flag set. */
#define PROC_DEFAULT_MOUNT_FLAGS (MS_NOSUID|MS_NOEXEC|MS_NODEV)
|
||||
#define SYS_DEFAULT_MOUNT_FLAGS (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV)
|
||||
|
||||
int mount_all(const char *dest,
|
||||
MountSettingsMask mount_settings,
|
||||
uid_t uid_shift,
|
||||
@ -538,7 +542,7 @@ int mount_all(const char *dest,
|
||||
|
||||
static const MountPoint mount_table[] = {
|
||||
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
|
||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
{ "proc", "/proc", "proc", NULL, PROC_DEFAULT_MOUNT_FLAGS,
|
||||
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
|
||||
|
||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
|
||||
@ -576,7 +580,7 @@ int mount_all(const char *dest,
|
||||
MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
|
||||
{ "tmpfs", "/sys", "tmpfs", "mode=555" TMPFS_LIMITS_SYS, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
{ "sysfs", "/sys", "sysfs", NULL, SYS_DEFAULT_MOUNT_FLAGS,
|
||||
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR }, /* skipped if above was mounted */
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL|MOUNT_MKDIR }, /* skipped if above was mounted */
|
||||
@ -1336,3 +1340,60 @@ done:
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* Mount points of the temporary procfs and sysfs instances: pin_fully_visible_fs() creates and
 * mounts them, do_wipe_fully_visible_fs() unmounts them and removes the directories again. */
#define NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS "/run/host/proc"
|
||||
#define NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS "/run/host/sys"
|
||||
|
||||
int pin_fully_visible_fs(void) {
|
||||
int r;
|
||||
|
||||
(void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, 0755);
|
||||
(void) mkdir_p(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, 0755);
|
||||
|
||||
r = mount_follow_verbose(LOG_ERR, "proc", NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, "proc", PROC_DEFAULT_MOUNT_FLAGS, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = mount_follow_verbose(LOG_ERR, "sysfs", NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, "sysfs", SYS_DEFAULT_MOUNT_FLAGS, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int do_wipe_fully_visible_fs(void) {
|
||||
if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS, MNT_DETACH) < 0)
|
||||
return log_error_errno(errno, "Failed to unmount temporary proc: %m");
|
||||
|
||||
if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_PROCFS) < 0)
|
||||
return log_error_errno(errno, "Failed to remove temporary proc mountpoint: %m");
|
||||
|
||||
if (umount2(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS, MNT_DETACH) < 0)
|
||||
return log_error_errno(errno, "Failed to unmount temporary sys: %m");
|
||||
|
||||
if (rmdir(NSPAWN_PRIVATE_FULLY_VISIBLE_SYSFS) < 0)
|
||||
return log_error_errno(errno, "Failed to remove temporary sys mountpoint: %m");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wipe_fully_visible_fs(int mntns_fd) {
|
||||
_cleanup_close_ int orig_mntns_fd = -EBADF;
|
||||
int r, rr;
|
||||
|
||||
r = namespace_open(0, NULL, &orig_mntns_fd, NULL, NULL, NULL);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to pin originating mount namespace: %m");
|
||||
|
||||
r = namespace_enter(-EBADF, mntns_fd, -EBADF, -EBADF, -EBADF);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to enter mount namespace: %m");
|
||||
|
||||
rr = do_wipe_fully_visible_fs();
|
||||
|
||||
r = namespace_enter(-EBADF, orig_mntns_fd, -EBADF, -EBADF, -EBADF);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to enter original mount namespace: %m");
|
||||
|
||||
return rr;
|
||||
}
|
||||
|
@ -67,3 +67,5 @@ int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s
|
||||
int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old);
|
||||
|
||||
int tmpfs_patch_options(const char *options,uid_t uid_shift, const char *selinux_apifs_context, char **ret);
|
||||
/* Mounts temporary procfs and sysfs instances below /run/host/. Returns 0 on success and a negative
 * value if one of the mounts failed. */
int pin_fully_visible_fs(void);
|
||||
/* Unmounts and removes the temporary procfs and sysfs instances from within the mount namespace
 * referred to by mntns_fd, then switches back to the mount namespace of the caller. */
int wipe_fully_visible_fs(int mntns_fd);
|
||||
|
@ -3632,7 +3632,7 @@ static int outer_child(
|
||||
|
||||
_cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
|
||||
_cleanup_strv_free_ char **os_release_pairs = NULL;
|
||||
_cleanup_close_ int fd = -1;
|
||||
_cleanup_close_ int fd = -1, mntns_fd = -EBADF;
|
||||
bool idmap = false;
|
||||
const char *p;
|
||||
pid_t pid;
|
||||
@ -3697,6 +3697,15 @@ static int outer_child(
|
||||
return r;
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO) {
|
||||
r = namespace_open(0, NULL, &mntns_fd, NULL, NULL, NULL);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to pin outer mount namespace: %m");
|
||||
|
||||
l = send_one_fd(notify_socket, mntns_fd, 0);
|
||||
if (l < 0)
|
||||
return log_error_errno(l, "Failed to send outer mount namespace fd: %m");
|
||||
mntns_fd = safe_close(mntns_fd);
|
||||
|
||||
/* Let the parent know which UID shift we read from the image */
|
||||
l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
|
||||
if (l < 0)
|
||||
@ -3974,6 +3983,20 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to move root directory: %m");
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO) {
|
||||
/* In order to mount procfs and sysfs in an unprivileged container the kernel
|
||||
* requires that a fully visible instance is already present in the target mount
|
||||
* namespace. Mount one here so the inner child can mount its own instances. Later
|
||||
* we umount the temporary instances created here before we actually exec the
|
||||
* payload. Since the rootfs is shared the umount will propagate into the container.
|
||||
* Note, the inner child wouldn't be able to unmount the instances on its own since
|
||||
* it doesn't own the originating mount namespace. IOW, the outer child needs to do
|
||||
* this. */
|
||||
r = pin_fully_visible_fs();
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
fd = setup_notify_child();
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
@ -4731,12 +4754,12 @@ static int run_container(
|
||||
rtnl_socket_pair[2] = { -1, -1 },
|
||||
pid_socket_pair[2] = { -1, -1 },
|
||||
uuid_socket_pair[2] = { -1, -1 },
|
||||
notify_socket_pair[2] = { -1, -1 },
|
||||
fd_socket_pair[2] = { -EBADF, -EBADF },
|
||||
uid_shift_socket_pair[2] = { -1, -1 },
|
||||
master_pty_socket_pair[2] = { -1, -1 },
|
||||
unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
|
||||
|
||||
_cleanup_close_ int notify_socket = -1;
|
||||
_cleanup_close_ int notify_socket = -1, mntns_fd = -EBADF;
|
||||
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
|
||||
_cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
|
||||
_cleanup_(sd_event_unrefp) sd_event *event = NULL;
|
||||
@ -4783,7 +4806,7 @@ static int run_container(
|
||||
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
|
||||
return log_error_errno(errno, "Failed to create id socket pair: %m");
|
||||
|
||||
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
|
||||
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, fd_socket_pair) < 0)
|
||||
return log_error_errno(errno, "Failed to create notify socket pair: %m");
|
||||
|
||||
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
|
||||
@ -4836,7 +4859,7 @@ static int run_container(
|
||||
rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
|
||||
pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
|
||||
uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
|
||||
notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
|
||||
fd_socket_pair[0] = safe_close(fd_socket_pair[0]);
|
||||
master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
|
||||
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
|
||||
unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
|
||||
@ -4850,7 +4873,7 @@ static int run_container(
|
||||
secondary,
|
||||
pid_socket_pair[1],
|
||||
uuid_socket_pair[1],
|
||||
notify_socket_pair[1],
|
||||
fd_socket_pair[1],
|
||||
kmsg_socket_pair[1],
|
||||
rtnl_socket_pair[1],
|
||||
uid_shift_socket_pair[1],
|
||||
@ -4872,12 +4895,16 @@ static int run_container(
|
||||
rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
|
||||
pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
|
||||
uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
|
||||
notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
|
||||
fd_socket_pair[1] = safe_close(fd_socket_pair[1]);
|
||||
master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
|
||||
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
|
||||
unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO) {
|
||||
mntns_fd = receive_one_fd(fd_socket_pair[0], 0);
|
||||
if (mntns_fd < 0)
|
||||
return log_error_errno(mntns_fd, "Failed to receive mount namespace fd from outer child: %m");
|
||||
|
||||
/* The child just let us know the UID shift it might have read from the image. */
|
||||
l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
|
||||
if (l < 0)
|
||||
@ -4954,7 +4981,7 @@ static int run_container(
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
|
||||
|
||||
/* We also retrieve the socket used for notifications generated by outer child */
|
||||
notify_socket = receive_one_fd(notify_socket_pair[0], 0);
|
||||
notify_socket = receive_one_fd(fd_socket_pair[0], 0);
|
||||
if (notify_socket < 0)
|
||||
return log_error_errno(notify_socket,
|
||||
"Failed to receive notification socket from the outer child: %m");
|
||||
@ -5139,6 +5166,13 @@ static int run_container(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO) {
|
||||
r = wipe_fully_visible_fs(mntns_fd);
|
||||
if (r < 0)
|
||||
return r;
|
||||
mntns_fd = safe_close(mntns_fd);
|
||||
}
|
||||
|
||||
/* Let the child know that we are ready and wait that the child is completely ready now. */
|
||||
if (!barrier_place_and_sync(&barrier)) /* #5 */
|
||||
return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
|
||||
|
Reference in New Issue
Block a user