mirror of
https://github.com/systemd/systemd.git
synced 2024-11-06 16:59:03 +03:00
nspawn: lock down a few things in /proc by default
This tightens security on /proc: a couple of files exposed there are now made inaccessible. These files may leak kernel internals or expose non-virtualized concepts, hence they are locked down by default. Moreover, a couple of directories in /proc that expose information also exposed in /sys are now marked read-only, similar to how we handle /sys. The list is taken from what docker/runc-based container managers generally apply, but is slightly extended.
This commit is contained in:
parent
10af01a5ff
commit
d4b653c589
@ -508,6 +508,18 @@ int mount_all(const char *dest,
|
|||||||
uid_t uid_shift, uid_t uid_range,
|
uid_t uid_shift, uid_t uid_range,
|
||||||
const char *selinux_apifs_context) {
|
const char *selinux_apifs_context) {
|
||||||
|
|
||||||
|
#define PROC_INACCESSIBLE(path) \
|
||||||
|
{ NULL, (path), NULL, NULL, MS_BIND, \
|
||||||
|
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
|
||||||
|
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
|
||||||
|
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
|
||||||
|
|
||||||
|
#define PROC_READ_ONLY(path) \
|
||||||
|
{ (path), (path), NULL, NULL, MS_BIND, \
|
||||||
|
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
|
||||||
|
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
|
||||||
|
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
|
||||||
|
|
||||||
typedef struct MountPoint {
|
typedef struct MountPoint {
|
||||||
const char *what;
|
const char *what;
|
||||||
const char *where;
|
const char *where;
|
||||||
@ -518,39 +530,72 @@ int mount_all(const char *dest,
|
|||||||
} MountPoint;
|
} MountPoint;
|
||||||
|
|
||||||
static const MountPoint mount_table[] = {
|
static const MountPoint mount_table[] = {
|
||||||
/* inner child mounts */
|
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
|
||||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
|
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
MOUNT_FATAL|MOUNT_IN_USERNS },
|
||||||
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
|
|
||||||
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
|
||||||
{ "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
|
||||||
{ NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
|
||||||
|
|
||||||
/* outer child mounts */
|
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
|
||||||
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
||||||
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
|
|
||||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
|
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
|
||||||
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
|
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
|
||||||
|
|
||||||
|
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
|
||||||
|
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||||
|
|
||||||
|
/* Make these files inaccessible to container payloads: they potentially leak information about kernel
|
||||||
|
* internals or the host's execution environment to the container */
|
||||||
|
PROC_INACCESSIBLE("/proc/kallsyms"),
|
||||||
|
PROC_INACCESSIBLE("/proc/kcore"),
|
||||||
|
PROC_INACCESSIBLE("/proc/keys"),
|
||||||
|
PROC_INACCESSIBLE("/proc/sysrq-trigger"),
|
||||||
|
PROC_INACCESSIBLE("/proc/timer_list"),
|
||||||
|
|
||||||
|
/* Make these directories read-only to container payloads: they show hardware information, and in some
|
||||||
|
* cases contain tunables the container really shouldn't have access to. */
|
||||||
|
PROC_READ_ONLY("/proc/acpi"),
|
||||||
|
PROC_READ_ONLY("/proc/apm"),
|
||||||
|
PROC_READ_ONLY("/proc/asound"),
|
||||||
|
PROC_READ_ONLY("/proc/bus"),
|
||||||
|
PROC_READ_ONLY("/proc/fs"),
|
||||||
|
PROC_READ_ONLY("/proc/irq"),
|
||||||
|
PROC_READ_ONLY("/proc/scsi"),
|
||||||
|
|
||||||
|
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
|
||||||
|
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||||
|
MOUNT_FATAL },
|
||||||
|
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||||
|
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
|
||||||
|
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||||
|
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
|
||||||
|
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||||
|
MOUNT_FATAL }, /* skipped if above was mounted */
|
||||||
|
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
|
||||||
|
MOUNT_FATAL },
|
||||||
|
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||||
|
MOUNT_FATAL },
|
||||||
|
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||||
|
MOUNT_FATAL },
|
||||||
|
|
||||||
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
|
|
||||||
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
|
||||||
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
|
||||||
#if HAVE_SELINUX
|
#if HAVE_SELINUX
|
||||||
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
|
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
|
||||||
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
|
0 }, /* Bind mount first */
|
||||||
|
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
|
||||||
|
0 }, /* Then, make it r/o */
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
size_t k;
|
_cleanup_(unlink_and_freep) char *inaccessible = NULL;
|
||||||
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
|
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
|
||||||
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
|
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
|
||||||
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
|
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
|
||||||
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
|
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
|
||||||
|
size_t k;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
|
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
|
||||||
_cleanup_free_ char *where = NULL, *options = NULL;
|
_cleanup_free_ char *where = NULL, *options = NULL;
|
||||||
const char *o;
|
const char *o, *what;
|
||||||
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
|
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
|
||||||
|
|
||||||
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
|
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
|
||||||
@ -566,12 +611,32 @@ int mount_all(const char *dest,
|
|||||||
if (r < 0)
|
if (r < 0)
|
||||||
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
|
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
|
||||||
|
|
||||||
|
if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
|
||||||
|
|
||||||
|
if (!inaccessible) {
|
||||||
|
_cleanup_free_ char *np = NULL;
|
||||||
|
|
||||||
|
r = tempfn_random_child(NULL, "inaccessible", &np);
|
||||||
|
if (r < 0)
|
||||||
|
return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
|
||||||
|
|
||||||
|
r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
|
||||||
|
if (r < 0)
|
||||||
|
return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
|
||||||
|
|
||||||
|
inaccessible = TAKE_PTR(np);
|
||||||
|
}
|
||||||
|
|
||||||
|
what = inaccessible;
|
||||||
|
} else
|
||||||
|
what = mount_table[k].what;
|
||||||
|
|
||||||
r = path_is_mount_point(where, NULL, 0);
|
r = path_is_mount_point(where, NULL, 0);
|
||||||
if (r < 0 && r != -ENOENT)
|
if (r < 0 && r != -ENOENT)
|
||||||
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
|
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
|
||||||
|
|
||||||
/* Skip this entry if it is not a remount. */
|
/* Skip this entry if it is not a remount. */
|
||||||
if (mount_table[k].what && r > 0)
|
if (what && r > 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
|
r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
|
||||||
@ -600,7 +665,7 @@ int mount_all(const char *dest,
|
|||||||
}
|
}
|
||||||
|
|
||||||
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
|
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
|
||||||
mount_table[k].what,
|
what,
|
||||||
where,
|
where,
|
||||||
mount_table[k].type,
|
mount_table[k].type,
|
||||||
mount_table[k].flags,
|
mount_table[k].flags,
|
||||||
|
@ -13,12 +13,13 @@
|
|||||||
#include "volatile-util.h"
|
#include "volatile-util.h"
|
||||||
|
|
||||||
typedef enum MountSettingsMask {
|
typedef enum MountSettingsMask {
|
||||||
MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
|
MOUNT_FATAL = 1U << 0, /* if set, a mount error is considered fatal */
|
||||||
MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
|
MOUNT_USE_USERNS = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
|
||||||
MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
|
MOUNT_IN_USERNS = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
|
||||||
MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
|
MOUNT_APPLY_APIVFS_RO = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
|
||||||
MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
|
MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
|
||||||
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
|
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
|
||||||
|
MOUNT_INACCESSIBLE_REG = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
|
||||||
} MountSettingsMask;
|
} MountSettingsMask;
|
||||||
|
|
||||||
typedef enum CustomMountType {
|
typedef enum CustomMountType {
|
||||||
|
Loading…
Reference in New Issue
Block a user