mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-03-12 08:58:20 +03:00
Merge pull request #8898 from poettering/nspawn-mount-block
some nspawn cgroup and mount lock-down fixes
This commit is contained in:
commit
6b1ca2a948
7
TODO
7
TODO
@ -24,6 +24,10 @@ Janitorial Clean-ups:
|
||||
|
||||
Features:
|
||||
|
||||
* nspawn: greater control over hostname, resolv.conf, timezone, rlim
|
||||
|
||||
* nspawn: when operating in a scope, also create /payload subcgroup
|
||||
|
||||
* the error paths in usbffs_dispatch_ep() leak memory
|
||||
|
||||
* cgroups: figure out if we can somehow communicate in a cleaner way whether a
|
||||
@ -52,9 +56,6 @@ Features:
|
||||
|
||||
* add --vacuum-xyz options to coredumpctl, matching those journalctl already has.
|
||||
|
||||
* list the exit codes from the BSD/glibc <sysexits.h> in our own
|
||||
exit-codes.[ch] tables.
|
||||
|
||||
* SuccessExitStatus= and friends should probably also accept symbolic exit
|
||||
codes names, i.e. error codes from the list maintained in exit-codes.[ch]
|
||||
|
||||
|
@ -424,15 +424,16 @@ unified you (of course, I guess) need to provide only `/sys/fs/cgroup/` itself.
|
||||
cgroup tree of systemd itself is out of limits for you. It's fine to *read*
|
||||
from any attribute you like however. That's totally OK and welcome.
|
||||
|
||||
4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a container
|
||||
payload running systemd, then don't get the idea that you can bind mount
|
||||
only a sub-tree of the host's cgroup tree into the container. Part of the
|
||||
cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
|
||||
4. 🚫 When not using `CLONE_NEWCGROUP` when delegating a sub-tree to a
|
||||
container payload running systemd, then don't get the idea that you can bind
|
||||
mount only a sub-tree of the host's cgroup tree into the container. Part of
|
||||
the cgroup API is that `/proc/$PID/cgroup` reports the cgroup path of every
|
||||
process, and hence any path below `/sys/fs/cgroup/` needs to match what
|
||||
`/proc/$PID/cgroup` of the payload processes reports. What you can do safely
|
||||
however, is mount the upper parts of the cgroup tree read-only or even
|
||||
replace it with an intermediary `tmpfs`, as long as the path to the
|
||||
delegated sub-tree remains accessible as-is.
|
||||
however, is mount the upper parts of the cgroup tree read-only (or even
|
||||
replace the middle bits with an intermediary `tmpfs` — but be careful not to
|
||||
break the `statfs()` detection logic discussed above), as long as the path
|
||||
to the delegated sub-tree remains accessible as-is.
|
||||
|
||||
5. ⚡ Currently, the algorithm for mapping between slice/scope/service unit
|
||||
naming and their cgroup paths is not considered public API of systemd, and
|
||||
|
@ -81,23 +81,26 @@ static const MountEntry apivfs_table[] = {
|
||||
|
||||
/* ProtectKernelTunables= option and the related filesystem APIs */
|
||||
static const MountEntry protect_kernel_tunables_table[] = {
|
||||
{ "/proc/sys", READONLY, false },
|
||||
{ "/proc/sysrq-trigger", READONLY, true },
|
||||
{ "/proc/latency_stats", READONLY, true },
|
||||
{ "/proc/mtrr", READONLY, true },
|
||||
{ "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
|
||||
{ "/proc/acpi", READONLY, true },
|
||||
{ "/proc/timer_stats", READONLY, true },
|
||||
{ "/proc/apm", READONLY, true }, /* Obsolete API, there's no point in permitting access to this, ever */
|
||||
{ "/proc/asound", READONLY, true },
|
||||
{ "/proc/bus", READONLY, true },
|
||||
{ "/proc/fs", READONLY, true },
|
||||
{ "/proc/irq", READONLY, true },
|
||||
{ "/proc/kallsyms", INACCESSIBLE, true },
|
||||
{ "/proc/kcore", INACCESSIBLE, true },
|
||||
{ "/proc/latency_stats", READONLY, true },
|
||||
{ "/proc/mtrr", READONLY, true },
|
||||
{ "/proc/scsi", READONLY, true },
|
||||
{ "/proc/sys", READONLY, false },
|
||||
{ "/proc/sysrq-trigger", READONLY, true },
|
||||
{ "/proc/timer_stats", READONLY, true },
|
||||
{ "/sys", READONLY, false },
|
||||
{ "/sys/kernel/debug", READONLY, true },
|
||||
{ "/sys/kernel/tracing", READONLY, true },
|
||||
{ "/sys/fs/bpf", READONLY, true },
|
||||
{ "/sys/fs/cgroup", READWRITE, false }, /* READONLY is set by ProtectControlGroups= option */
|
||||
{ "/sys/fs/selinux", READWRITE, true },
|
||||
{ "/sys/kernel/debug", READONLY, true },
|
||||
{ "/sys/kernel/tracing", READONLY, true },
|
||||
};
|
||||
|
||||
/* ProtectKernelModules= option */
|
||||
|
@ -141,44 +141,53 @@ finish:
|
||||
return r;
|
||||
}
|
||||
|
||||
int create_subcgroup(pid_t pid, CGroupUnified unified_requested) {
|
||||
int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested) {
|
||||
_cleanup_free_ char *cgroup = NULL;
|
||||
const char *child;
|
||||
int r;
|
||||
CGroupMask supported;
|
||||
const char *payload;
|
||||
int r;
|
||||
|
||||
/* In the unified hierarchy inner nodes may only contain
|
||||
* subgroups, but not processes. Hence, if we running in the
|
||||
* unified hierarchy and the container does the same, and we
|
||||
* did not create a scope unit for the container move us and
|
||||
* the container into two separate subcgroups. */
|
||||
assert(pid > 1);
|
||||
|
||||
if (unified_requested == CGROUP_UNIFIED_NONE)
|
||||
return 0;
|
||||
|
||||
r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine whether the systemd controller is unified: %m");
|
||||
if (r == 0)
|
||||
return 0;
|
||||
/* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we are running in
|
||||
* the unified hierarchy and the container does the same, and we did not create a scope unit for the container
|
||||
* move us and the container into two separate subcgroups.
|
||||
*
|
||||
* Moreover, container payloads such as systemd try to manage the cgroup they run in in full (i.e. including
|
||||
* its attributes), while the host systemd will only delegate cgroups for children of the cgroup created for a
|
||||
* delegation unit, instead of the cgroup itself. This means, if we'd pass on the cgroup allocated from the
|
||||
* host systemd directly to the payload, the host and payload systemd might fight for the cgroup
|
||||
* attributes. Hence, let's insert an intermediary cgroup to cover that case too.
|
||||
*
|
||||
* Note that we only bother with the main hierarchy here, not with any secondary ones. On the unified setup
|
||||
* that's fine because there's only one hierarchy anyway and controllers are enabled directly on it. On the
|
||||
* legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
|
||||
* do it. */
|
||||
|
||||
r = cg_mask_supported(&supported);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine supported controllers: %m");
|
||||
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
|
||||
if (keep_unit)
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
|
||||
else
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to get our control group: %m");
|
||||
|
||||
child = strjoina(cgroup, "/payload");
|
||||
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
|
||||
payload = strjoina(cgroup, "/payload");
|
||||
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
|
||||
return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
|
||||
|
||||
child = strjoina(cgroup, "/supervisor");
|
||||
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
|
||||
if (keep_unit) {
|
||||
const char *supervisor;
|
||||
|
||||
supervisor = strjoina(cgroup, "/supervisor");
|
||||
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
|
||||
}
|
||||
|
||||
/* Try to enable as many controllers as possible for the new payload. */
|
||||
(void) cg_enable_everywhere(supported, supported, cgroup);
|
||||
|
@ -14,4 +14,4 @@
|
||||
|
||||
int chown_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
|
||||
int sync_cgroup(pid_t pid, CGroupUnified unified_requested, uid_t uid_shift);
|
||||
int create_subcgroup(pid_t pid, CGroupUnified unified_requested);
|
||||
int create_subcgroup(pid_t pid, bool keep_unit, CGroupUnified unified_requested);
|
||||
|
@ -27,7 +27,7 @@
|
||||
#include "user-util.h"
|
||||
#include "util.h"
|
||||
|
||||
CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
|
||||
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
|
||||
CustomMount *c, *ret;
|
||||
|
||||
assert(l);
|
||||
@ -48,8 +48,8 @@ CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
void custom_mount_free_all(CustomMount *l, unsigned n) {
|
||||
unsigned i;
|
||||
void custom_mount_free_all(CustomMount *l, size_t n) {
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
CustomMount *m = l + i;
|
||||
@ -110,8 +110,8 @@ static char *resolve_source_path(const char *dest, const char *source) {
|
||||
return strdup(source);
|
||||
}
|
||||
|
||||
int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
|
||||
unsigned i;
|
||||
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
|
||||
size_t i;
|
||||
int r;
|
||||
|
||||
/* Prepare all custom mounts. This will make sure we know all temporary directories. This is called in the
|
||||
@ -133,8 +133,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
|
||||
if (!s)
|
||||
return log_oom();
|
||||
|
||||
free(m->source);
|
||||
m->source = s;
|
||||
free_and_replace(m->source, s);
|
||||
} else {
|
||||
/* No source specified? In that case, use a throw-away temporary directory in /var/tmp */
|
||||
|
||||
@ -165,8 +164,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
|
||||
if (!s)
|
||||
return log_oom();
|
||||
|
||||
free(*j);
|
||||
*j = s;
|
||||
free_and_replace(*j, s);
|
||||
}
|
||||
|
||||
if (m->work_dir) {
|
||||
@ -176,8 +174,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
|
||||
if (!s)
|
||||
return log_oom();
|
||||
|
||||
free(m->work_dir);
|
||||
m->work_dir = s;
|
||||
free_and_replace(m->work_dir, s);
|
||||
} else {
|
||||
assert(m->source);
|
||||
|
||||
@ -193,7 +190,7 @@ int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
|
||||
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
|
||||
_cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
|
||||
const char *p = s;
|
||||
CustomMount *m;
|
||||
@ -239,7 +236,7 @@ int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only
|
||||
return 0;
|
||||
}
|
||||
|
||||
int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
|
||||
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
|
||||
_cleanup_free_ char *path = NULL, *opts = NULL;
|
||||
const char *p = s;
|
||||
CustomMount *m;
|
||||
@ -275,7 +272,7 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only) {
|
||||
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
|
||||
_cleanup_free_ char *upper = NULL, *destination = NULL;
|
||||
_cleanup_strv_free_ char **lower = NULL;
|
||||
CustomMount *m;
|
||||
@ -511,6 +508,18 @@ int mount_all(const char *dest,
|
||||
uid_t uid_shift, uid_t uid_range,
|
||||
const char *selinux_apifs_context) {
|
||||
|
||||
#define PROC_INACCESSIBLE(path) \
|
||||
{ NULL, (path), NULL, NULL, MS_BIND, \
|
||||
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_INACCESSIBLE_REG }, /* Bind mount first ... */ \
|
||||
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
|
||||
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
|
||||
|
||||
#define PROC_READ_ONLY(path) \
|
||||
{ (path), (path), NULL, NULL, MS_BIND, \
|
||||
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
|
||||
{ NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
|
||||
MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
|
||||
|
||||
typedef struct MountPoint {
|
||||
const char *what;
|
||||
const char *where;
|
||||
@ -521,39 +530,72 @@ int mount_all(const char *dest,
|
||||
} MountPoint;
|
||||
|
||||
static const MountPoint mount_table[] = {
|
||||
/* inner child mounts */
|
||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_IN_USERNS },
|
||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
||||
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
|
||||
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||
{ "/proc/sysrq-trigger", "/proc/sysrq-trigger", NULL, NULL, MS_BIND, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
||||
{ NULL, "/proc/sysrq-trigger", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
|
||||
{ "proc", "/proc", "proc", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL|MOUNT_IN_USERNS },
|
||||
|
||||
/* outer child mounts */
|
||||
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
||||
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV, MOUNT_FATAL }, /* skipped if above was mounted */
|
||||
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND,
|
||||
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */
|
||||
|
||||
{ "/proc/sys/net", "/proc/sys/net", NULL, NULL, MS_BIND,
|
||||
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */
|
||||
|
||||
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
|
||||
MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* ... then, make it r/o */
|
||||
|
||||
/* Make these files inaccessible to container payloads: they potentially leak information about kernel
|
||||
* internals or the host's execution environment to the container */
|
||||
PROC_INACCESSIBLE("/proc/kallsyms"),
|
||||
PROC_INACCESSIBLE("/proc/kcore"),
|
||||
PROC_INACCESSIBLE("/proc/keys"),
|
||||
PROC_INACCESSIBLE("/proc/sysrq-trigger"),
|
||||
PROC_INACCESSIBLE("/proc/timer_list"),
|
||||
|
||||
/* Make these directories read-only to container payloads: they show hardware information, and in some
|
||||
* cases contain tunables the container really shouldn't have access to. */
|
||||
PROC_READ_ONLY("/proc/acpi"),
|
||||
PROC_READ_ONLY("/proc/apm"),
|
||||
PROC_READ_ONLY("/proc/asound"),
|
||||
PROC_READ_ONLY("/proc/bus"),
|
||||
PROC_READ_ONLY("/proc/fs"),
|
||||
PROC_READ_ONLY("/proc/irq"),
|
||||
PROC_READ_ONLY("/proc/scsi"),
|
||||
|
||||
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
|
||||
{ "tmpfs", "/tmp", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||
MOUNT_FATAL },
|
||||
{ "tmpfs", "/sys", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS },
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO }, /* skipped if above was mounted */
|
||||
{ "sysfs", "/sys", "sysfs", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
|
||||
MOUNT_FATAL }, /* skipped if above was mounted */
|
||||
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,
|
||||
MOUNT_FATAL },
|
||||
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||
MOUNT_FATAL },
|
||||
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
|
||||
MOUNT_FATAL },
|
||||
|
||||
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, MOUNT_FATAL },
|
||||
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
||||
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, MOUNT_FATAL },
|
||||
#if HAVE_SELINUX
|
||||
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND, 0 }, /* Bind mount first */
|
||||
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, 0 }, /* Then, make it r/o */
|
||||
{ "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,
|
||||
0 }, /* Bind mount first */
|
||||
{ NULL, "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
|
||||
0 }, /* Then, make it r/o */
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned k;
|
||||
int r;
|
||||
_cleanup_(unlink_and_freep) char *inaccessible = NULL;
|
||||
bool use_userns = (mount_settings & MOUNT_USE_USERNS);
|
||||
bool netns = (mount_settings & MOUNT_APPLY_APIVFS_NETNS);
|
||||
bool ro = (mount_settings & MOUNT_APPLY_APIVFS_RO);
|
||||
bool in_userns = (mount_settings & MOUNT_IN_USERNS);
|
||||
size_t k;
|
||||
int r;
|
||||
|
||||
for (k = 0; k < ELEMENTSOF(mount_table); k++) {
|
||||
_cleanup_free_ char *where = NULL, *options = NULL;
|
||||
const char *o;
|
||||
const char *o, *what;
|
||||
bool fatal = (mount_table[k].mount_settings & MOUNT_FATAL);
|
||||
|
||||
if (in_userns != (bool)(mount_table[k].mount_settings & MOUNT_IN_USERNS))
|
||||
@ -569,12 +611,32 @@ int mount_all(const char *dest,
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);
|
||||
|
||||
if (mount_table[k].mount_settings & MOUNT_INACCESSIBLE_REG) {
|
||||
|
||||
if (!inaccessible) {
|
||||
_cleanup_free_ char *np = NULL;
|
||||
|
||||
r = tempfn_random_child(NULL, "inaccessible", &np);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to generate inaccessible file node path: %m");
|
||||
|
||||
r = touch_file(np, false, USEC_INFINITY, UID_INVALID, GID_INVALID, 0000);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create inaccessible file node '%s': %m", np);
|
||||
|
||||
inaccessible = TAKE_PTR(np);
|
||||
}
|
||||
|
||||
what = inaccessible;
|
||||
} else
|
||||
what = mount_table[k].what;
|
||||
|
||||
r = path_is_mount_point(where, NULL, 0);
|
||||
if (r < 0 && r != -ENOENT)
|
||||
return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
|
||||
|
||||
/* Skip this entry if it is not a remount. */
|
||||
if (mount_table[k].what && r > 0)
|
||||
if (what && r > 0)
|
||||
continue;
|
||||
|
||||
r = mkdir_userns_p(dest, where, 0755, mount_settings, uid_shift);
|
||||
@ -603,7 +665,7 @@ int mount_all(const char *dest,
|
||||
}
|
||||
|
||||
r = mount_verbose(fatal ? LOG_ERR : LOG_DEBUG,
|
||||
mount_table[k].what,
|
||||
what,
|
||||
where,
|
||||
mount_table[k].type,
|
||||
mount_table[k].flags,
|
||||
@ -766,11 +828,11 @@ static int mount_overlay(const char *dest, CustomMount *m) {
|
||||
|
||||
int mount_custom(
|
||||
const char *dest,
|
||||
CustomMount *mounts, unsigned n,
|
||||
CustomMount *mounts, size_t n,
|
||||
bool userns, uid_t uid_shift, uid_t uid_range,
|
||||
const char *selinux_apifs_context) {
|
||||
|
||||
unsigned i;
|
||||
size_t i;
|
||||
int r;
|
||||
|
||||
assert(dest);
|
||||
|
@ -13,12 +13,13 @@
|
||||
#include "volatile-util.h"
|
||||
|
||||
typedef enum MountSettingsMask {
|
||||
MOUNT_FATAL = 1 << 0, /* if set, a mount error is considered fatal */
|
||||
MOUNT_USE_USERNS = 1 << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
|
||||
MOUNT_IN_USERNS = 1 << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
|
||||
MOUNT_APPLY_APIVFS_RO = 1 << 3, /* if set, /proc/sys, and /sysfs will be mounted read-only, otherwise read-write. */
|
||||
MOUNT_APPLY_APIVFS_NETNS = 1 << 4, /* if set, /proc/sys/net will be mounted read-write.
|
||||
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
|
||||
MOUNT_FATAL = 1U << 0, /* if set, a mount error is considered fatal */
|
||||
MOUNT_USE_USERNS = 1U << 1, /* if set, mounts are patched considering uid/gid shifts in a user namespace */
|
||||
MOUNT_IN_USERNS = 1U << 2, /* if set, the mount is executed in the inner child, otherwise in the outer child */
|
||||
MOUNT_APPLY_APIVFS_RO = 1U << 3, /* if set, /proc/sys, and /sys will be mounted read-only, otherwise read-write. */
|
||||
MOUNT_APPLY_APIVFS_NETNS = 1U << 4, /* if set, /proc/sys/net will be mounted read-write.
|
||||
Works only if MOUNT_APPLY_APIVFS_RO is also set. */
|
||||
MOUNT_INACCESSIBLE_REG = 1U << 5, /* if set, create an inaccessible regular file first and use as bind mount source */
|
||||
} MountSettingsMask;
|
||||
|
||||
typedef enum CustomMountType {
|
||||
@ -40,13 +41,13 @@ typedef struct CustomMount {
|
||||
char *rm_rf_tmpdir;
|
||||
} CustomMount;
|
||||
|
||||
CustomMount* custom_mount_add(CustomMount **l, unsigned *n, CustomMountType t);
|
||||
void custom_mount_free_all(CustomMount *l, unsigned n);
|
||||
int custom_mount_prepare_all(const char *dest, CustomMount *l, unsigned n);
|
||||
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t);
|
||||
void custom_mount_free_all(CustomMount *l, size_t n);
|
||||
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n);
|
||||
|
||||
int bind_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
|
||||
int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s);
|
||||
int overlay_mount_parse(CustomMount **l, unsigned *n, const char *s, bool read_only);
|
||||
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
|
||||
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s);
|
||||
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only);
|
||||
|
||||
int mount_all(const char *dest, MountSettingsMask mount_settings, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
|
||||
@ -54,7 +55,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
|
||||
int mount_cgroups(const char *dest, CGroupUnified unified_requested, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
|
||||
int mount_systemd_cgroup_writable(const char *dest, CGroupUnified unified_requested);
|
||||
|
||||
int mount_custom(const char *dest, CustomMount *mounts, unsigned n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
|
||||
int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "bus-unit-util.h"
|
||||
#include "bus-util.h"
|
||||
#include "nspawn-register.h"
|
||||
#include "special.h"
|
||||
#include "stat-util.h"
|
||||
#include "strv.h"
|
||||
#include "util.h"
|
||||
@ -309,7 +310,7 @@ int allocate_scope(
|
||||
"PIDs", "au", 1, pid,
|
||||
"Description", "s", description,
|
||||
"Delegate", "b", 1,
|
||||
"Slice", "s", isempty(slice) ? "machine.slice" : slice);
|
||||
"Slice", "s", isempty(slice) ? SPECIAL_MACHINE_SLICE : slice);
|
||||
if (r < 0)
|
||||
return bus_log_create_error(r);
|
||||
|
||||
|
@ -76,7 +76,7 @@ typedef struct Settings {
|
||||
int read_only;
|
||||
VolatileMode volatile_mode;
|
||||
CustomMount *custom_mounts;
|
||||
unsigned n_custom_mounts;
|
||||
size_t n_custom_mounts;
|
||||
int userns_chown;
|
||||
|
||||
/* [Network] */
|
||||
|
@ -165,7 +165,7 @@ static uint64_t arg_caps_retain =
|
||||
(1ULL << CAP_SYS_RESOURCE) |
|
||||
(1ULL << CAP_SYS_TTY_CONFIG);
|
||||
static CustomMount *arg_custom_mounts = NULL;
|
||||
static unsigned arg_n_custom_mounts = 0;
|
||||
static size_t arg_n_custom_mounts = 0;
|
||||
static char **arg_setenv = NULL;
|
||||
static bool arg_quiet = false;
|
||||
static bool arg_register = true;
|
||||
@ -291,7 +291,7 @@ static void help(void) {
|
||||
}
|
||||
|
||||
static int custom_mount_check_all(void) {
|
||||
unsigned i;
|
||||
size_t i;
|
||||
|
||||
for (i = 0; i < arg_n_custom_mounts; i++) {
|
||||
CustomMount *m = &arg_custom_mounts[i];
|
||||
@ -1470,31 +1470,35 @@ static int setup_resolv_conf(const char *dest) {
|
||||
}
|
||||
|
||||
static int setup_boot_id(void) {
|
||||
_cleanup_(unlink_and_freep) char *from = NULL;
|
||||
_cleanup_free_ char *path = NULL;
|
||||
sd_id128_t rnd = SD_ID128_NULL;
|
||||
const char *from, *to;
|
||||
const char *to;
|
||||
int r;
|
||||
|
||||
/* Generate a new randomized boot ID, so that each boot-up of
|
||||
* the container gets a new one */
|
||||
|
||||
from = "/run/proc-sys-kernel-random-boot-id";
|
||||
to = "/proc/sys/kernel/random/boot_id";
|
||||
r = tempfn_random_child(NULL, "proc-sys-kernel-random-boot-id", &path);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to generate random boot ID path: %m");
|
||||
|
||||
r = sd_id128_randomize(&rnd);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to generate random boot id: %m");
|
||||
|
||||
r = id128_write(from, ID128_UUID, rnd, false);
|
||||
r = id128_write(path, ID128_UUID, rnd, false);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to write boot id: %m");
|
||||
|
||||
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
|
||||
if (r >= 0)
|
||||
r = mount_verbose(LOG_ERR, NULL, to, NULL,
|
||||
MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
|
||||
from = TAKE_PTR(path);
|
||||
to = "/proc/sys/kernel/random/boot_id";
|
||||
|
||||
(void) unlink(from);
|
||||
return r;
|
||||
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
return mount_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
|
||||
}
|
||||
|
||||
static int copy_devnodes(const char *dest) {
|
||||
@ -1662,26 +1666,32 @@ static int setup_keyring(void) {
|
||||
}
|
||||
|
||||
static int setup_kmsg(int kmsg_socket) {
|
||||
const char *from, *to;
|
||||
_cleanup_(unlink_and_freep) char *from = NULL;
|
||||
_cleanup_free_ char *fifo = NULL;
|
||||
_cleanup_close_ int fd = -1;
|
||||
_cleanup_umask_ mode_t u;
|
||||
int fd, r;
|
||||
const char *to;
|
||||
int r;
|
||||
|
||||
assert(kmsg_socket >= 0);
|
||||
|
||||
u = umask(0000);
|
||||
|
||||
/* We create the kmsg FIFO as /run/kmsg, but immediately
|
||||
* delete it after bind mounting it to /proc/kmsg. While FIFOs
|
||||
* on the reading side behave very similar to /proc/kmsg,
|
||||
* their writing side behaves differently from /dev/kmsg in
|
||||
* that writing blocks when nothing is reading. In order to
|
||||
* avoid any problems with containers deadlocking due to this
|
||||
* we simply make /dev/kmsg unavailable to the container. */
|
||||
from = "/run/kmsg";
|
||||
/* We create the kmsg FIFO as a temporary file in /tmp, but immediately delete it after bind mounting it to
|
||||
* /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
|
||||
* differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
|
||||
* with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
|
||||
|
||||
r = tempfn_random_child(NULL, "proc-kmsg", &fifo);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to generate kmsg path: %m");
|
||||
|
||||
if (mkfifo(fifo, 0600) < 0)
|
||||
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
|
||||
|
||||
from = TAKE_PTR(fifo);
|
||||
to = "/proc/kmsg";
|
||||
|
||||
if (mkfifo(from, 0600) < 0)
|
||||
return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
|
||||
r = mount_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -1690,17 +1700,11 @@ static int setup_kmsg(int kmsg_socket) {
|
||||
if (fd < 0)
|
||||
return log_error_errno(errno, "Failed to open fifo: %m");
|
||||
|
||||
/* Store away the fd in the socket, so that it stays open as
|
||||
* long as we run the child */
|
||||
/* Store away the fd in the socket, so that it stays open as long as we run the child */
|
||||
r = send_one_fd(kmsg_socket, fd, 0);
|
||||
safe_close(fd);
|
||||
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to send FIFO fd: %m");
|
||||
|
||||
/* And now make the FIFO unavailable as /run/kmsg... */
|
||||
(void) unlink(from);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2265,7 +2269,7 @@ static int inner_child(
|
||||
|
||||
_cleanup_free_ char *home = NULL;
|
||||
char as_uuid[37];
|
||||
unsigned n_env = 1;
|
||||
size_t n_env = 1;
|
||||
const char *envp[] = {
|
||||
"PATH=" DEFAULT_PATH_COMPAT,
|
||||
NULL, /* container */
|
||||
@ -3639,11 +3643,9 @@ static int run(int master,
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if (arg_keep_unit) {
|
||||
r = create_subcgroup(*pid, arg_unified_cgroup_hierarchy);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
|
||||
if (r < 0)
|
||||
|
Loading…
x
Reference in New Issue
Block a user