mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-01-11 05:17:44 +03:00
nspawn: R/W support for /sys, and /proc/sys
This commit adds the possibility to leave /sys and /proc/sys read-write. It introduces a new (undocumented) environment variable, SYSTEMD_NSPAWN_API_VFS_WRITABLE, to enable this feature. If set to "yes", /sys and /proc/sys will be read-write. If set to "no", /sys and /proc/sys will be read-only. If set to "network", /proc/sys/net will be read-write. This is useful in use cases where systemd-nspawn is used in an external network namespace. It also makes it possible to start privileged containers which need more control over settings in the /proc and /sys filesystems. This is a follow-up to the discussion in https://github.com/systemd/systemd/pull/4018#r76971862, where the introduction of a simple environment variable to enable R/W support for those directories was already discussed.
This commit is contained in:
parent
843d5baf6a
commit
4f086aab52
@ -225,9 +225,10 @@ static int tmpfs_patch_options(
|
||||
return !!buf;
|
||||
}
|
||||
|
||||
int mount_sysfs(const char *dest) {
|
||||
int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
|
||||
const char *full, *top, *x;
|
||||
int r;
|
||||
unsigned long extra_flags = 0;
|
||||
|
||||
top = prefix_roota(dest, "/sys");
|
||||
r = path_check_fstype(top, SYSFS_MAGIC);
|
||||
@ -244,8 +245,11 @@ int mount_sysfs(const char *dest) {
|
||||
|
||||
(void) mkdir(full, 0755);
|
||||
|
||||
if (mount_settings & MOUNT_APPLY_APIVFS_RO)
|
||||
extra_flags |= MS_RDONLY;
|
||||
|
||||
r = mount_verbose(LOG_ERR, "sysfs", full, "sysfs",
|
||||
MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
|
||||
MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -267,7 +271,7 @@ int mount_sysfs(const char *dest) {
|
||||
return r;
|
||||
|
||||
r = mount_verbose(LOG_ERR, NULL, to, NULL,
|
||||
MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
|
||||
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
@ -291,7 +295,7 @@ int mount_sysfs(const char *dest) {
|
||||
}
|
||||
|
||||
return mount_verbose(LOG_ERR, NULL, top, NULL,
|
||||
MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL);
|
||||
MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
|
||||
}
|
||||
|
||||
static int mkdir_userns(const char *path, mode_t mode, bool in_userns, uid_t uid_shift) {
|
||||
@ -348,8 +352,7 @@ static int mkdir_userns_p(const char *prefix, const char *path, mode_t mode, boo
|
||||
}
|
||||
|
||||
int mount_all(const char *dest,
|
||||
bool use_userns, bool in_userns,
|
||||
bool use_netns,
|
||||
MountSettingsMask mount_settings,
|
||||
uid_t uid_shift, uid_t uid_range,
|
||||
const char *selinux_apifs_context) {
|
||||
|
||||
@ -359,41 +362,52 @@ int mount_all(const char *dest,
|
||||
const char *type;
|
||||
const char *options;
|
||||
unsigned long flags;
|
||||
bool fatal;
|
||||
bool in_userns;
|
||||
bool use_netns;
|
||||
MountSettingsMask mount_settings;
|
||||
} MountPoint;
|
||||
|
||||
static const MountPoint mount_table[] = {
|
||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, true, true, false },
|
||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true, false }, /* Bind mount first ...*/
|
||||
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, true, true, true }, /* (except for this) */
|
||||
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true, false }, /* ... then, make it r/o */
|
||||
{ "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, false, true, false }, /* Bind mount first ...*/
|
||||
{ NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, true, false }, /* ... then, make it r/o */
|
||||
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, true },
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false, false },
|
||||
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false, false },
|
||||
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
|
||||
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false, false },
|
||||
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, true, true, false },
|
||||
/* inner child mounts */
|
||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
|
||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/
|
||||
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
|
||||
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||
{ "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ...*/
|
||||
{ NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_STRICTATIME, MOUNT_FATAL|MOUNT_IN_USERNS },
|
||||
|
||||
/* outer child mounts */
|
||||
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
|
||||
|
||||
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
|
||||
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
||||
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
||||
#ifdef HAVE_SELINUX
|
||||
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, false, false, false }, /* Bind mount first */
|
||||
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, false, false, false }, /* Then, make it r/o */
|
||||
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
|
||||
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned k;
|
||||
int r;
|
||||
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
|
||||
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
|
||||
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
|
||||
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
|
||||
|
||||
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
|
||||
_cleanup_free_ char *where = NULL, *options = NULL;
|
||||
const char *o;
|
||||
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
|
||||
|
||||
if (in_userns != mount_table[k].in_userns)
|
||||
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
|
||||
continue;
|
||||
|
||||
if (!use_netns && mount_table[k].use_netns)
|
||||
if (!netns && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_NETNS))
|
||||
continue;
|
||||
|
||||
if (!ro && (bool)(mount_table[k].mount_settings & MOUNT_APPLY_APIVFS_RO))
|
||||
continue;
|
||||
|
||||
where = prefix_root(dest, mount_table[k].where);
|
||||
@ -410,7 +424,7 @@ int mount_all(const char *dest,
|
||||
|
||||
r = mkdir_userns_p(dest, where, 0755, in_userns, uid_shift);
|
||||
if (r < 0 && r != -EEXIST) {
|
||||
if (mount_table[k].fatal)
|
||||
if (fatal)
|
||||
return log_error_errno(r, "Failed to create directory %s: %m", where);
|
||||
|
||||
log_debug_errno(r, "Failed to create directory %s: %m", where);
|
||||
@ -429,13 +443,13 @@ int mount_all(const char *dest,
|
||||
o = options;
|
||||
}
|
||||
|
||||
r = mount_verbose(mount_table[k].fatal ? LOG_ERR : LOG_DEBUG,
|
||||
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
|
||||
mount_table[k].what,
|
||||
where,
|
||||
mount_table[k].type,
|
||||
mount_table[k].flags,
|
||||
o);
|
||||
if (r < 0 && mount_table[k].fatal)
|
||||
if (r < 0 && fatal)
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -23,6 +23,15 @@
|
||||
|
||||
#include "cgroup-util.h"
|
||||
|
||||
/* Bit flags describing how the API file systems are set up. A combination is passed as the
 * mount_settings argument of mount_all() and mount_sysfs(), and each mount_table entry in
 * mount_all() carries its own combination that is matched against the caller's flags. */
typedef enum MountSettingsMask {
|
||||
MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
|
||||
MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
|
||||
MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
|
||||
MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys and /sys will be mounted read-only, otherwise read-write */
|
||||
MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
|
||||
Only has an effect if MOUNT_APPLY_APIVFS_RO is also set. */
|
||||
} MountSettingsMask;
|
||||
|
||||
typedef enum VolatileMode {
|
||||
VOLATILE_NO,
|
||||
VOLATILE_YES,
|
||||
@ -57,8 +66,8 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
|
||||
|
||||
int custom_mount_compare(const void *a, const void *b);
|
||||
|
||||
int mount_all(const char *dest, bool use_userns, bool in_userns, bool use_netns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
int mount_sysfs(const char *dest);
|
||||
int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
|
||||
|
||||
int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
|
||||
int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
|
||||
|
@ -195,6 +195,7 @@ static const char *arg_container_service_name = "systemd-nspawn";
|
||||
static bool arg_notify_ready = false;
|
||||
static bool arg_use_cgns = true;
|
||||
static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
|
||||
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
|
||||
|
||||
static void help(void) {
|
||||
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
|
||||
@ -378,6 +379,31 @@ static void parse_share_ns_env(const char *name, unsigned long ns_flag) {
|
||||
arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
|
||||
}
|
||||
|
||||
/* Applies the SYSTEMD_NSPAWN_API_VFS_WRITABLE environment variable to arg_mount_settings.
 *
 *   unset         -> arg_mount_settings is left untouched
 *   "network"     -> MOUNT_APPLY_APIVFS_RO and MOUNT_APPLY_APIVFS_NETNS are both set
 *   boolean true  -> MOUNT_APPLY_APIVFS_RO is cleared, MOUNT_APPLY_APIVFS_NETNS is cleared
 *   boolean false -> MOUNT_APPLY_APIVFS_RO is set, MOUNT_APPLY_APIVFS_NETNS is cleared
 *   anything else -> a warning is logged and arg_mount_settings is left untouched
 *
 * NOTE(review): parse_argv() sets MOUNT_APPLY_APIVFS_NETNS for arg_private_network right before
 * calling this function, so any valid boolean value here drops that flag again. Confirm that
 * the environment variable is meant to override it. */
static void parse_mount_settings_env(void) {
|
||||
int r;
|
||||
const char *e;
|
||||
|
||||
e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
|
||||
if (!e)
|
||||
return;
|
||||
|
||||
/* "network" is not a boolean, so it has to be checked before parse_boolean() is tried. */
if (streq(e, "network")) {
|
||||
arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
|
||||
return;
|
||||
}
|
||||
|
||||
r = parse_boolean(e);
|
||||
/* A negative result means the value could not be parsed: warn and keep the current settings. */
if (r < 0) {
|
||||
log_warning_errno(r, "Failed to parse SYSTEMD_NSPAWN_API_VFS_WRITABLE from environment, ignoring.");
|
||||
return;
|
||||
} else if (r > 0)
|
||||
arg_mount_settings &= ~MOUNT_APPLY_APIVFS_RO;
|
||||
else
|
||||
arg_mount_settings |= MOUNT_APPLY_APIVFS_RO;
|
||||
|
||||
/* Reached for both boolean outcomes: the network-only exception is switched off. */
arg_mount_settings &= ~MOUNT_APPLY_APIVFS_NETNS;
|
||||
}
|
||||
|
||||
static int parse_argv(int argc, char *argv[]) {
|
||||
|
||||
enum {
|
||||
@ -1070,6 +1096,14 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
|
||||
parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO)
|
||||
arg_mount_settings |= MOUNT_USE_USERNS;
|
||||
|
||||
if (arg_private_network)
|
||||
arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
|
||||
|
||||
parse_mount_settings_env();
|
||||
|
||||
if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
|
||||
!(arg_clone_ns_flags & CLONE_NEWUTS)) {
|
||||
arg_register = false;
|
||||
@ -1164,6 +1198,15 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
static int verify_arguments(void) {
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network) {
|
||||
log_error("Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO)) {
|
||||
log_error("Cannot combine --private-users with read-write mounts.");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (arg_volatile_mode != VOLATILE_NO && arg_read_only) {
|
||||
log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
|
||||
@ -2700,9 +2743,7 @@ static int inner_child(
|
||||
return log_error_errno(r, "Couldn't become new root: %m");
|
||||
|
||||
r = mount_all(NULL,
|
||||
arg_userns_mode != USER_NAMESPACE_NO,
|
||||
true,
|
||||
arg_private_network,
|
||||
arg_mount_settings | MOUNT_IN_USERNS,
|
||||
arg_uid_shift,
|
||||
arg_uid_range,
|
||||
arg_selinux_apifs_context);
|
||||
@ -2710,7 +2751,7 @@ static int inner_child(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = mount_sysfs(NULL);
|
||||
r = mount_sysfs(NULL, arg_mount_settings);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -3077,9 +3118,7 @@ static int outer_child(
|
||||
}
|
||||
|
||||
r = mount_all(directory,
|
||||
arg_userns_mode != USER_NAMESPACE_NO,
|
||||
false,
|
||||
arg_private_network,
|
||||
arg_mount_settings,
|
||||
arg_uid_shift,
|
||||
arg_uid_range,
|
||||
arg_selinux_apifs_context);
|
||||
|
Loading…
Reference in New Issue
Block a user