mirror of
https://github.com/systemd/systemd.git
synced 2025-03-25 18:50:18 +03:00
Merge pull request #3589 from brauner/cgroup_namespace
Cgroup namespace
This commit is contained in:
commit
1a0b98c437
@ -134,6 +134,20 @@ int cg_read_event(const char *controller, const char *path, const char *event,
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/* Report whether cgroup namespaces are available to this process.
 *
 * Availability is probed by checking that "/proc/self/ns/cgroup" exists. The
 * answer is cached in a thread-local variable, so access() is called at most
 * once per thread; later calls return the cached result.
 *
 * Returns true if the namespace file exists, false otherwise (including when
 * access() fails for any other reason). */
bool cg_ns_supported(void) {
        /* Negative means "not probed yet"; afterwards holds 0 or 1. */
        static thread_local int cached = -1;

        if (cached < 0)
                cached = access("/proc/self/ns/cgroup", F_OK) == 0 ? 1 : 0;

        return cached;
}
|
||||
|
||||
int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
|
||||
_cleanup_free_ char *fs = NULL;
|
||||
int r;
|
||||
|
@ -222,6 +222,8 @@ int cg_mask_supported(CGroupMask *ret);
|
||||
|
||||
int cg_kernel_controllers(Set *controllers);
|
||||
|
||||
bool cg_ns_supported(void);
|
||||
|
||||
int cg_unified(void);
|
||||
void cg_unified_flush(void);
|
||||
|
||||
|
@ -445,6 +445,10 @@ struct btrfs_ioctl_quota_ctl_args {
|
||||
#define CGROUP2_SUPER_MAGIC 0x63677270
|
||||
#endif
|
||||
|
||||
#ifndef CLONE_NEWCGROUP
|
||||
#define CLONE_NEWCGROUP 0x02000000
|
||||
#endif
|
||||
|
||||
#ifndef TMPFS_MAGIC
|
||||
#define TMPFS_MAGIC 0x01021994
|
||||
#endif
|
||||
|
@ -23,6 +23,8 @@
|
||||
#include "alloc-util.h"
|
||||
#include "cgroup-util.h"
|
||||
#include "escape.h"
|
||||
#include "fd-util.h"
|
||||
#include "fileio.h"
|
||||
#include "fs-util.h"
|
||||
#include "label.h"
|
||||
#include "mkdir.h"
|
||||
@ -181,13 +183,15 @@ int tmpfs_mount_parse(CustomMount **l, unsigned *n, const char *s) {
|
||||
|
||||
static int tmpfs_patch_options(
|
||||
const char *options,
|
||||
bool userns, uid_t uid_shift, uid_t uid_range,
|
||||
bool userns,
|
||||
uid_t uid_shift, uid_t uid_range,
|
||||
bool patch_ids,
|
||||
const char *selinux_apifs_context,
|
||||
char **ret) {
|
||||
|
||||
char *buf = NULL;
|
||||
|
||||
if (userns && uid_shift != 0) {
|
||||
if ((userns && uid_shift != 0) || patch_ids) {
|
||||
assert(uid_shift != UID_INVALID);
|
||||
|
||||
if (options)
|
||||
@ -218,7 +222,13 @@ static int tmpfs_patch_options(
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!buf && options) {
|
||||
buf = strdup(options);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
}
|
||||
*ret = buf;
|
||||
|
||||
return !!buf;
|
||||
}
|
||||
|
||||
@ -271,7 +281,15 @@ int mount_sysfs(const char *dest) {
|
||||
return log_error_errno(errno, "Failed to remove %s: %m", full);
|
||||
|
||||
x = prefix_roota(top, "/fs/kdbus");
|
||||
(void) mkdir(x, 0755);
|
||||
(void) mkdir_p(x, 0755);
|
||||
|
||||
/* Create mountpoint for cgroups. Otherwise we are not allowed since we
|
||||
* remount /sys read-only.
|
||||
*/
|
||||
if (cg_ns_supported()) {
|
||||
x = prefix_roota(top, "/fs/cgroup");
|
||||
(void) mkdir_p(x, 0755);
|
||||
}
|
||||
|
||||
if (mount(NULL, top, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, NULL) < 0)
|
||||
return log_error_errno(errno, "Failed to make %s read-only: %m", top);
|
||||
@ -349,7 +367,7 @@ int mount_all(const char *dest,
|
||||
|
||||
o = mount_table[k].options;
|
||||
if (streq_ptr(mount_table[k].type, "tmpfs")) {
|
||||
r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, selinux_apifs_context, &options);
|
||||
r = tmpfs_patch_options(o, use_userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
if (r > 0)
|
||||
@ -486,7 +504,7 @@ static int mount_tmpfs(
|
||||
if (r < 0 && r != -EEXIST)
|
||||
return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
|
||||
|
||||
r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
|
||||
r = tmpfs_patch_options(m->options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
options = r > 0 ? buf : m->options;
|
||||
@ -601,6 +619,48 @@ int mount_custom(
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Retrieve existing subsystems. This function is called in a new cgroup
|
||||
* namespace.
|
||||
*/
|
||||
static int get_controllers(Set *subsystems) {
|
||||
_cleanup_fclose_ FILE *f = NULL;
|
||||
char line[LINE_MAX];
|
||||
|
||||
assert(subsystems);
|
||||
|
||||
f = fopen("/proc/self/cgroup", "re");
|
||||
if (!f)
|
||||
return errno == ENOENT ? -ESRCH : -errno;
|
||||
|
||||
FOREACH_LINE(line, f, return -errno) {
|
||||
int r;
|
||||
char *e, *l, *p;
|
||||
|
||||
truncate_nl(line);
|
||||
|
||||
l = strchr(line, ':');
|
||||
if (!l)
|
||||
continue;
|
||||
|
||||
l++;
|
||||
e = strchr(l, ':');
|
||||
if (!e)
|
||||
continue;
|
||||
|
||||
*e = 0;
|
||||
|
||||
if (streq(l, "") || streq(l, "name=systemd"))
|
||||
continue;
|
||||
|
||||
p = strdup(l);
|
||||
r = set_consume(subsystems, p);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
|
||||
char *to;
|
||||
int r;
|
||||
@ -629,11 +689,107 @@ static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controlle
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int mount_legacy_cgroups(
|
||||
/* Mount a legacy cgroup hierarchy when cgroup namespaces are supported. */
|
||||
static int mount_legacy_cgns_supported(
|
||||
bool userns, uid_t uid_shift, uid_t uid_range,
|
||||
const char *selinux_apifs_context) {
|
||||
_cleanup_set_free_free_ Set *controllers = NULL;
|
||||
const char *cgroup_root = "/sys/fs/cgroup", *c;
|
||||
int r;
|
||||
|
||||
(void) mkdir_p(cgroup_root, 0755);
|
||||
|
||||
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
|
||||
r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
|
||||
if (r == 0) {
|
||||
_cleanup_free_ char *options = NULL;
|
||||
|
||||
/* When cgroup namespaces are enabled and user namespaces are
|
||||
* used then the mount of the cgroupfs is done *inside* the new
|
||||
* user namespace. We're root in the new user namespace and the
|
||||
* kernel will happily translate our uid/gid to the correct
|
||||
* uid/gid as seen from e.g. /proc/1/mountinfo. So we simply
|
||||
* pass uid 0 and not uid_shift to tmpfs_patch_options().
|
||||
*/
|
||||
r = tmpfs_patch_options("mode=755", userns, 0, uid_range, true, selinux_apifs_context, &options);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
|
||||
if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
|
||||
return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
|
||||
}
|
||||
|
||||
if (cg_unified() > 0)
|
||||
goto skip_controllers;
|
||||
|
||||
controllers = set_new(&string_hash_ops);
|
||||
if (!controllers)
|
||||
return log_oom();
|
||||
|
||||
r = get_controllers(controllers);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to determine cgroup controllers: %m");
|
||||
|
||||
for (;;) {
|
||||
_cleanup_free_ const char *controller = NULL;
|
||||
|
||||
controller = set_steal_first(controllers);
|
||||
if (!controller)
|
||||
break;
|
||||
|
||||
r = mount_legacy_cgroup_hierarchy("", controller, controller, !userns);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
/* When multiple hierarchies are co-mounted, make their
|
||||
* constituting individual hierarchies a symlink to the
|
||||
* co-mount.
|
||||
*/
|
||||
c = controller;
|
||||
for (;;) {
|
||||
_cleanup_free_ char *target = NULL, *tok = NULL;
|
||||
|
||||
r = extract_first_word(&c, &tok, ",", 0);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to extract co-mounted cgroup controller: %m");
|
||||
if (r == 0)
|
||||
break;
|
||||
|
||||
target = prefix_root("/sys/fs/cgroup", tok);
|
||||
if (!target)
|
||||
return log_oom();
|
||||
|
||||
if (streq(controller, tok))
|
||||
break;
|
||||
|
||||
r = symlink_idempotent(controller, target);
|
||||
if (r == -EINVAL)
|
||||
return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
|
||||
}
|
||||
}
|
||||
|
||||
skip_controllers:
|
||||
r = mount_legacy_cgroup_hierarchy("", "none,name=systemd,xattr", "systemd", false);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
if (!userns) {
|
||||
if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
|
||||
return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Mount legacy cgroup hierarchy when cgroup namespaces are unsupported. */
|
||||
static int mount_legacy_cgns_unsupported(
|
||||
const char *dest,
|
||||
bool userns, uid_t uid_shift, uid_t uid_range,
|
||||
const char *selinux_apifs_context) {
|
||||
|
||||
_cleanup_set_free_free_ Set *controllers = NULL;
|
||||
const char *cgroup_root;
|
||||
int r;
|
||||
@ -649,7 +805,7 @@ static int mount_legacy_cgroups(
|
||||
if (r == 0) {
|
||||
_cleanup_free_ char *options = NULL;
|
||||
|
||||
r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, selinux_apifs_context, &options);
|
||||
r = tmpfs_patch_options("mode=755", userns, uid_shift, uid_range, false, selinux_apifs_context, &options);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
|
||||
@ -708,10 +864,8 @@ static int mount_legacy_cgroups(
|
||||
return r;
|
||||
|
||||
r = symlink_idempotent(combined, target);
|
||||
if (r == -EINVAL) {
|
||||
log_error("Invalid existing symlink for combined hierarchy");
|
||||
return r;
|
||||
}
|
||||
if (r == -EINVAL)
|
||||
return log_error_errno(r, "Invalid existing symlink for combined hierarchy: %m");
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create symlink for combined hierarchy: %m");
|
||||
}
|
||||
@ -766,8 +920,10 @@ int mount_cgroups(
|
||||
|
||||
if (unified_requested)
|
||||
return mount_unified_cgroups(dest);
|
||||
else
|
||||
return mount_legacy_cgroups(dest, userns, uid_shift, uid_range, selinux_apifs_context);
|
||||
else if (cg_ns_supported())
|
||||
return mount_legacy_cgns_supported(userns, uid_shift, uid_range, selinux_apifs_context);
|
||||
|
||||
return mount_legacy_cgns_unsupported(dest, userns, uid_shift, uid_range, selinux_apifs_context);
|
||||
}
|
||||
|
||||
int mount_systemd_cgroup_writable(
|
||||
@ -835,7 +991,7 @@ int setup_volatile_state(
|
||||
return log_error_errno(errno, "Failed to create %s: %m", directory);
|
||||
|
||||
options = "mode=755";
|
||||
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
|
||||
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
if (r > 0)
|
||||
@ -871,7 +1027,7 @@ int setup_volatile(
|
||||
return log_error_errno(errno, "Failed to create temporary directory: %m");
|
||||
|
||||
options = "mode=755";
|
||||
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, selinux_apifs_context, &buf);
|
||||
r = tmpfs_patch_options(options, userns, uid_shift, uid_range, false, selinux_apifs_context, &buf);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
if (r > 0)
|
||||
|
@ -2589,9 +2589,24 @@ static int inner_child(
|
||||
return -ESRCH;
|
||||
}
|
||||
|
||||
r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (cg_ns_supported()) {
|
||||
r = unshare(CLONE_NEWCGROUP);
|
||||
if (r < 0)
|
||||
return log_error_errno(errno, "Failed to unshare cgroup namespace");
|
||||
r = mount_cgroups(
|
||||
"",
|
||||
arg_unified_cgroup_hierarchy,
|
||||
arg_userns_mode != USER_NAMESPACE_NO,
|
||||
arg_uid_shift,
|
||||
arg_uid_range,
|
||||
arg_selinux_apifs_context);
|
||||
if (r < 0)
|
||||
return r;
|
||||
} else {
|
||||
r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
r = reset_uid_gid();
|
||||
if (r < 0)
|
||||
@ -2973,15 +2988,17 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = mount_cgroups(
|
||||
directory,
|
||||
arg_unified_cgroup_hierarchy,
|
||||
arg_userns_mode != USER_NAMESPACE_NO,
|
||||
arg_uid_shift,
|
||||
arg_uid_range,
|
||||
arg_selinux_apifs_context);
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (!cg_ns_supported()) {
|
||||
r = mount_cgroups(
|
||||
directory,
|
||||
arg_unified_cgroup_hierarchy,
|
||||
arg_userns_mode != USER_NAMESPACE_NO,
|
||||
arg_uid_shift,
|
||||
arg_uid_range,
|
||||
arg_selinux_apifs_context);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
r = mount_move_root(directory);
|
||||
if (r < 0)
|
||||
|
Loading…
x
Reference in New Issue
Block a user