1
0
mirror of https://github.com/systemd/systemd.git synced 2024-11-06 08:26:52 +03:00

core: unified cgroup hierarchy support

This patch set adds full support the new unified cgroup hierarchy logic
of modern kernels.

A new kernel command line option "systemd.unified_cgroup_hierarchy=1" is
added. If specified the unified hierarchy is mounted to /sys/fs/cgroup
instead of a tmpfs. No further hierarchies are mounted. The kernel
command line option defaults to off. We can turn it on by default as
soon as the kernel's APIs regarding this are stabilized (but even then
downstream distros might want to turn this off, as this will break any
tools that access cgroupfs directly).

It is possibly to choose for each boot individually whether the unified
or the legacy hierarchy is used. nspawn will by default provide the
legacy hierarchy to containers if the host is using it, and the unified
otherwise. However it is possible to run containers with the unified
hierarchy on a legacy host and vice versa, by setting the
$UNIFIED_CGROUP_HIERARCHY environment variable for nspawn to 1 or 0,
respectively.

The unified hierarchy provides reliable cgroup empty notifications for
the first time, via inotify. To make use of this we maintain one
manager-wide inotify fd, and each cgroup to it.

This patch also removes cg_delete() which is unused now.

On kernel 4.2 only the "memory" controller is compatible with the
unified hierarchy, hence that's the only controller systemd exposes when
booted in unified heirarchy mode.

This introduces a new enum for enumerating supported controllers, plus a
related enum for the mask bits mapping to it. The core is changed to
make use of this everywhere.

This moves PID 1 into a new "init.scope" implicit scope unit in the root
slice. This is necessary since on the unified hierarchy cgroups may
either contain subgroups or processes but not both. PID 1 hence has to
move out of the root cgroup (strictly speaking the root cgroup is the
only one where processes and subgroups are still allowed, but in order
to support containers nicey, we move PID 1 into the new scope in all
cases.) This new unit is also used on legacy hierarchy setups. It's
actually pretty useful on all systems, as it can then be used to filter
journal messages coming from PID 1, and so on.

The root slice ("-.slice") is now implicitly created and started (and
does not require a unit file on disk anymore), since
that's where "init.scope" is located and the slice needs to be started
before the scope can.

To check whether we are in unified or legacy hierarchy mode we use
statfs() on /sys/fs/cgroup. If the .f_type field reports tmpfs we are in
legacy mode, if it reports cgroupfs we are in unified mode.

This patch set carefuly makes sure that cgls and cgtop continue to work
as desired.

When invoking nspawn as a service it will implicitly create two
subcgroups in the cgroup it is using, one to move the nspawn process
into, the other to move the actual container processes into. This is
done because of the requirement that cgroups may either contain
processes or other subgroups.
This commit is contained in:
Lennart Poettering 2015-09-01 19:22:36 +02:00
parent 92dcf85e11
commit efdb02375b
23 changed files with 1497 additions and 595 deletions

File diff suppressed because it is too large Load Diff

View File

@ -28,15 +28,28 @@
#include "set.h"
#include "def.h"
/* An enum of well known cgroup controllers */
typedef enum CGroupController {
CGROUP_CONTROLLER_CPU,
CGROUP_CONTROLLER_CPUACCT,
CGROUP_CONTROLLER_BLKIO,
CGROUP_CONTROLLER_MEMORY,
CGROUP_CONTROLLER_DEVICE,
_CGROUP_CONTROLLER_MAX,
_CGROUP_CONTROLLER_INVALID = -1,
} CGroupController;
#define CGROUP_CONTROLLER_TO_MASK(c) (1 << (c))
/* A bit mask of well known cgroup controllers */
typedef enum CGroupControllerMask {
CGROUP_CPU = 1,
CGROUP_CPUACCT = 2,
CGROUP_BLKIO = 4,
CGROUP_MEMORY = 8,
CGROUP_DEVICE = 16,
_CGROUP_CONTROLLER_MASK_ALL = 31
} CGroupControllerMask;
typedef enum CGroupMask {
CGROUP_MASK_CPU = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPU),
CGROUP_MASK_CPUACCT = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_CPUACCT),
CGROUP_MASK_BLKIO = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_BLKIO),
CGROUP_MASK_MEMORY = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_MEMORY),
CGROUP_MASK_DEVICE = CGROUP_CONTROLLER_TO_MASK(CGROUP_CONTROLLER_DEVICE),
_CGROUP_MASK_ALL = CGROUP_CONTROLLER_TO_MASK(_CGROUP_CONTROLLER_MAX) - 1
} CGroupMask;
/*
* General rules:
@ -77,7 +90,6 @@ int cg_pid_get_path(const char *controller, pid_t pid, char **path);
int cg_trim(const char *controller, const char *path, bool delete_root);
int cg_rmdir(const char *controller, const char *path);
int cg_delete(const char *controller, const char *path);
int cg_create(const char *controller, const char *path);
int cg_attach(const char *controller, const char *path, pid_t pid);
@ -126,14 +138,24 @@ bool cg_controller_is_valid(const char *p);
int cg_slice_to_path(const char *unit, char **ret);
typedef const char* (*cg_migrate_callback_t)(CGroupControllerMask mask, void *userdata);
typedef const char* (*cg_migrate_callback_t)(CGroupMask mask, void *userdata);
int cg_create_everywhere(CGroupControllerMask supported, CGroupControllerMask mask, const char *path);
int cg_attach_everywhere(CGroupControllerMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata);
int cg_attach_many_everywhere(CGroupControllerMask supported, const char *path, Set* pids, cg_migrate_callback_t callback, void *userdata);
int cg_migrate_everywhere(CGroupControllerMask supported, const char *from, const char *to, cg_migrate_callback_t callback, void *userdata);
int cg_trim_everywhere(CGroupControllerMask supported, const char *path, bool delete_root);
int cg_create_everywhere(CGroupMask supported, CGroupMask mask, const char *path);
int cg_attach_everywhere(CGroupMask supported, const char *path, pid_t pid, cg_migrate_callback_t callback, void *userdata);
int cg_attach_many_everywhere(CGroupMask supported, const char *path, Set* pids, cg_migrate_callback_t callback, void *userdata);
int cg_migrate_everywhere(CGroupMask supported, const char *from, const char *to, cg_migrate_callback_t callback, void *userdata);
int cg_trim_everywhere(CGroupMask supported, const char *path, bool delete_root);
int cg_enable_everywhere(CGroupMask supported, CGroupMask mask, const char *p);
CGroupControllerMask cg_mask_supported(void);
int cg_mask_supported(CGroupMask *ret);
int cg_kernel_controllers(Set *controllers);
int cg_unified(void);
void cg_unified_flush(void);
bool cg_is_unified_wanted(void);
bool cg_is_legacy_wanted(void);
const char* cgroup_controller_to_string(CGroupController c) _const_;
CGroupController cgroup_controller_from_string(const char *s) _pure_;

View File

@ -35,7 +35,7 @@
* the watchdog pings will keep the loop busy. */
#define DEFAULT_EXIT_USEC (30*USEC_PER_SEC)
#define SYSTEMD_CGROUP_CONTROLLER "systemd"
#define SYSTEMD_CGROUP_CONTROLLER "name=systemd"
#define SIGNALS_CRASH_HANDLER SIGSEGV,SIGILL,SIGFPE,SIGBUS,SIGQUIT,SIGABRT
#define SIGNALS_IGNORE SIGPIPE

View File

@ -492,6 +492,14 @@ struct btrfs_ioctl_quota_ctl_args {
#define BTRFS_SUPER_MAGIC 0x9123683E
#endif
#ifndef CGROUP_SUPER_MAGIC
#define CGROUP_SUPER_MAGIC 0x27e0eb
#endif
#ifndef TMPFS_MAGIC
#define TMPFS_MAGIC 0x01021994
#endif
#ifndef MS_MOVE
#define MS_MOVE 8192
#endif

View File

@ -115,3 +115,6 @@
#define SPECIAL_USER_SLICE "user.slice"
#define SPECIAL_MACHINE_SLICE "machine.slice"
#define SPECIAL_ROOT_SLICE "-.slice"
/* The scope unit systemd itself lives in. */
#define SPECIAL_INIT_SCOPE "init.scope"

View File

@ -225,7 +225,10 @@ int main(int argc, char *argv[]) {
} else
path = root;
printf("Controller %s; control group %s:\n", controller, path);
if (cg_unified() > 0)
printf("Control group %s:\n", path);
else
printf("Controller %s; control group %s:\n", controller, path);
fflush(stdout);
q = show_cgroup(controller, path, NULL, 0, arg_kernel_threads, output_flags);

View File

@ -175,7 +175,7 @@ static int process(
if (g->n_tasks > 0)
g->n_tasks_valid = true;
} else if (streq(controller, "cpuacct")) {
} else if (streq(controller, "cpuacct") && cg_unified() <= 0) {
_cleanup_free_ char *p = NULL, *v = NULL;
uint64_t new_usage;
nsec_t timestamp;
@ -217,7 +217,10 @@ static int process(
} else if (streq(controller, "memory")) {
_cleanup_free_ char *p = NULL, *v = NULL;
r = cg_get_path(controller, path, "memory.usage_in_bytes", &p);
if (cg_unified() <= 0)
r = cg_get_path(controller, path, "memory.usage_in_bytes", &p);
else
r = cg_get_path(controller, path, "memory.current", &p);
if (r < 0)
return r;
@ -234,7 +237,7 @@ static int process(
if (g->memory > 0)
g->memory_valid = true;
} else if (streq(controller, "blkio")) {
} else if (streq(controller, "blkio") && cg_unified() <= 0) {
_cleanup_fclose_ FILE *f = NULL;
_cleanup_free_ char *p = NULL;
uint64_t wr = 0, rd = 0;

View File

@ -283,7 +283,7 @@ fail:
return -errno;
}
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state) {
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state) {
bool is_root;
int r;
@ -304,7 +304,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
* cgroup trees (assuming we are running in a container then),
* and missing cgroups, i.e. EROFS and ENOENT. */
if ((mask & CGROUP_CPU) && !is_root) {
if ((mask & CGROUP_MASK_CPU) && !is_root) {
char buf[MAX(DECIMAL_STR_MAX(unsigned long), DECIMAL_STR_MAX(usec_t)) + 1];
sprintf(buf, "%lu\n",
@ -331,7 +331,7 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
"Failed to set cpu.cfs_quota_us on %s: %m", path);
}
if (mask & CGROUP_BLKIO) {
if (mask & CGROUP_MASK_BLKIO) {
char buf[MAX3(DECIMAL_STR_MAX(unsigned long)+1,
DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(unsigned long)*1,
DECIMAL_STR_MAX(dev_t)*2+2+DECIMAL_STR_MAX(uint64_t)+1)];
@ -381,21 +381,30 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
}
}
if ((mask & CGROUP_MEMORY) && !is_root) {
if ((mask & CGROUP_MASK_MEMORY) && !is_root) {
if (c->memory_limit != (uint64_t) -1) {
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
sprintf(buf, "%" PRIu64 "\n", c->memory_limit);
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
} else
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
if (cg_unified() <= 0)
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", buf);
else
r = cg_set_attribute("memory", path, "memory.max", buf);
} else {
if (cg_unified() <= 0)
r = cg_set_attribute("memory", path, "memory.limit_in_bytes", "-1");
else
r = cg_set_attribute("memory", path, "memory.max", "max");
}
if (r < 0)
log_full_errno(IN_SET(r, -ENOENT, -EROFS) ? LOG_DEBUG : LOG_WARNING, r,
"Failed to set memory.limit_in_bytes on %s: %m", path);
"Failed to set memory.limit_in_bytes/memory.max on %s: %m", path);
}
if ((mask & CGROUP_DEVICE) && !is_root) {
if ((mask & CGROUP_MASK_DEVICE) && !is_root) {
CGroupDeviceAllow *a;
/* Changing the devices list of a populated cgroup
@ -459,8 +468,8 @@ void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const cha
}
}
CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
CGroupControllerMask mask = 0;
CGroupMask cgroup_context_get_mask(CGroupContext *c) {
CGroupMask mask = 0;
/* Figure out which controllers we need */
@ -468,29 +477,31 @@ CGroupControllerMask cgroup_context_get_mask(CGroupContext *c) {
c->cpu_shares != (unsigned long) -1 ||
c->startup_cpu_shares != (unsigned long) -1 ||
c->cpu_quota_per_sec_usec != USEC_INFINITY)
mask |= CGROUP_CPUACCT | CGROUP_CPU;
mask |= CGROUP_MASK_CPUACCT | CGROUP_MASK_CPU;
if (c->blockio_accounting ||
c->blockio_weight != (unsigned long) -1 ||
c->startup_blockio_weight != (unsigned long) -1 ||
c->blockio_device_weights ||
c->blockio_device_bandwidths)
mask |= CGROUP_BLKIO;
mask |= CGROUP_MASK_BLKIO;
if (c->memory_accounting ||
c->memory_limit != (uint64_t) -1)
mask |= CGROUP_MEMORY;
mask |= CGROUP_MASK_MEMORY;
if (c->device_allow ||
c->device_policy != CGROUP_AUTO)
mask |= CGROUP_DEVICE;
mask |= CGROUP_MASK_DEVICE;
return mask;
}
CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
CGroupMask unit_get_own_mask(Unit *u) {
CGroupContext *c;
/* Returns the mask of controllers the unit needs for itself */
c = unit_get_cgroup_context(u);
if (!c)
return 0;
@ -505,15 +516,18 @@ CGroupControllerMask unit_get_cgroup_mask(Unit *u) {
e = unit_get_exec_context(u);
if (!e || exec_context_maintains_privileges(e))
return _CGROUP_CONTROLLER_MASK_ALL;
return _CGROUP_MASK_ALL;
}
return cgroup_context_get_mask(c);
}
CGroupControllerMask unit_get_members_mask(Unit *u) {
CGroupMask unit_get_members_mask(Unit *u) {
assert(u);
/* Returns the mask of controllers all of the unit's children
* require, merged */
if (u->cgroup_members_mask_valid)
return u->cgroup_members_mask;
@ -532,7 +546,7 @@ CGroupControllerMask unit_get_members_mask(Unit *u) {
continue;
u->cgroup_members_mask |=
unit_get_cgroup_mask(member) |
unit_get_own_mask(member) |
unit_get_members_mask(member);
}
}
@ -541,19 +555,52 @@ CGroupControllerMask unit_get_members_mask(Unit *u) {
return u->cgroup_members_mask;
}
CGroupControllerMask unit_get_siblings_mask(Unit *u) {
CGroupMask unit_get_siblings_mask(Unit *u) {
assert(u);
/* Returns the mask of controllers all of the unit's siblings
* require, i.e. the members mask of the unit's parent slice
* if there is one. */
if (UNIT_ISSET(u->slice))
return unit_get_members_mask(UNIT_DEREF(u->slice));
return unit_get_cgroup_mask(u) | unit_get_members_mask(u);
return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupControllerMask unit_get_target_mask(Unit *u) {
CGroupControllerMask mask;
CGroupMask unit_get_subtree_mask(Unit *u) {
mask = unit_get_cgroup_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
/* Returns the mask of this subtree, meaning of the group
* itself and its children. */
return unit_get_own_mask(u) | unit_get_members_mask(u);
}
CGroupMask unit_get_target_mask(Unit *u) {
CGroupMask mask;
/* This returns the cgroup mask of all controllers to enable
* for a specific cgroup, i.e. everything it needs itself,
* plus all that its children need, plus all that its siblings
* need. This is primarily useful on the legacy cgroup
* hierarchy, where we need to duplicate each cgroup in each
* hierarchy that shall be enabled for it. */
mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
mask &= u->manager->cgroup_supported;
return mask;
}
CGroupMask unit_get_enable_mask(Unit *u) {
CGroupMask mask;
/* This returns the cgroup mask of all controllers to enable
* for the children of a specific cgroup. This is primarily
* useful for the unified cgroup hierarchy, where each cgroup
* controls which controllers are enabled for its children. */
mask = unit_get_members_mask(u);
mask &= u->manager->cgroup_supported;
return mask;
@ -562,13 +609,13 @@ CGroupControllerMask unit_get_target_mask(Unit *u) {
/* Recurse from a unit up through its containing slices, propagating
* mask bits upward. A unit is also member of itself. */
void unit_update_cgroup_members_masks(Unit *u) {
CGroupControllerMask m;
CGroupMask m;
bool more;
assert(u);
/* Calculate subtree mask */
m = unit_get_cgroup_mask(u) | unit_get_members_mask(u);
m = unit_get_subtree_mask(u);
/* See if anything changed from the previous invocation. If
* not, we're done. */
@ -608,7 +655,7 @@ void unit_update_cgroup_members_masks(Unit *u) {
}
}
static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
static const char *migrate_callback(CGroupMask mask, void *userdata) {
Unit *u = userdata;
assert(mask != 0);
@ -626,7 +673,115 @@ static const char *migrate_callback(CGroupControllerMask mask, void *userdata) {
return NULL;
}
static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
char *unit_default_cgroup_path(Unit *u) {
_cleanup_free_ char *escaped = NULL, *slice = NULL;
int r;
assert(u);
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
return strdup(u->manager->cgroup_root);
if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
if (r < 0)
return NULL;
}
escaped = cg_escape(u->id);
if (!escaped)
return NULL;
if (slice)
return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
else
return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
int unit_set_cgroup_path(Unit *u, const char *path) {
_cleanup_free_ char *p = NULL;
int r;
assert(u);
if (path) {
p = strdup(path);
if (!p)
return -ENOMEM;
} else
p = NULL;
if (streq_ptr(u->cgroup_path, p))
return 0;
if (p) {
r = hashmap_put(u->manager->cgroup_unit, p, u);
if (r < 0)
return r;
}
unit_release_cgroup(u);
u->cgroup_path = p;
p = NULL;
return 1;
}
int unit_watch_cgroup(Unit *u) {
_cleanup_free_ char *populated = NULL;
int r;
assert(u);
if (!u->cgroup_path)
return 0;
if (u->cgroup_inotify_wd >= 0)
return 0;
/* Only applies to the unified hierarchy */
r = cg_unified();
if (r < 0)
return log_unit_error_errno(u, r, "Failed detect wether the unified hierarchy is used: %m");
if (r == 0)
return 0;
/* Don't watch the root slice, it's pointless. */
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
return 0;
r = hashmap_ensure_allocated(&u->manager->cgroup_inotify_wd_unit, &trivial_hash_ops);
if (r < 0)
return log_oom();
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "cgroup.populated", &populated);
if (r < 0)
return log_oom();
u->cgroup_inotify_wd = inotify_add_watch(u->manager->cgroup_inotify_fd, populated, IN_MODIFY);
if (u->cgroup_inotify_wd < 0) {
if (errno == ENOENT) /* If the directory is already
* gone we don't need to track
* it, so this is not an error */
return 0;
return log_unit_error_errno(u, errno, "Failed to add inotify watch descriptor for control group %s: %m", u->cgroup_path);
}
r = hashmap_put(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd), u);
if (r < 0)
return log_unit_error_errno(u, r, "Failed to add inotify watch descriptor to hash map: %m");
return 0;
}
static int unit_create_cgroup(
Unit *u,
CGroupMask target_mask,
CGroupMask enable_mask) {
CGroupContext *c;
int r;
@ -643,25 +798,29 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
if (!path)
return log_oom();
r = hashmap_put(u->manager->cgroup_unit, path, u);
if (r < 0) {
log_error(r == -EEXIST ? "cgroup %s exists already: %s" : "hashmap_put failed for %s: %s", path, strerror(-r));
return r;
}
if (r > 0) {
u->cgroup_path = path;
path = NULL;
}
r = unit_set_cgroup_path(u, path);
if (r == -EEXIST)
return log_unit_error_errno(u, r, "Control group %s exists already.", path);
if (r < 0)
return log_unit_error_errno(u, r, "Failed to set unit's control group path to %s: %m", path);
}
/* First, create our own group */
r = cg_create_everywhere(u->manager->cgroup_supported, mask, u->cgroup_path);
r = cg_create_everywhere(u->manager->cgroup_supported, target_mask, u->cgroup_path);
if (r < 0)
return log_error_errno(r, "Failed to create cgroup %s: %m", u->cgroup_path);
return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", u->cgroup_path);
/* Start watching it */
(void) unit_watch_cgroup(u);
/* Enable all controllers we need */
r = cg_enable_everywhere(u->manager->cgroup_supported, enable_mask, u->cgroup_path);
if (r < 0)
log_unit_warning_errno(u, r, "Failed to enable controllers on cgroup %s, ignoring: %m", u->cgroup_path);
/* Keep track that this is now realized */
u->cgroup_realized = true;
u->cgroup_realized_mask = mask;
u->cgroup_realized_mask = target_mask;
if (u->type != UNIT_SLICE && !c->delegate) {
@ -670,7 +829,7 @@ static int unit_create_cgroups(Unit *u, CGroupControllerMask mask) {
* for slice and delegation units. */
r = cg_migrate_everywhere(u->manager->cgroup_supported, u->cgroup_path, u->cgroup_path, migrate_callback, u);
if (r < 0)
log_warning_errno(r, "Failed to migrate cgroup from to %s: %m", u->cgroup_path);
log_unit_warning_errno(u, r, "Failed to migrate cgroup from to %s, ignoring: %m", u->cgroup_path);
}
return 0;
@ -691,10 +850,10 @@ int unit_attach_pids_to_cgroup(Unit *u) {
return 0;
}
static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
static bool unit_has_mask_realized(Unit *u, CGroupMask target_mask) {
assert(u);
return u->cgroup_realized && u->cgroup_realized_mask == mask;
return u->cgroup_realized && u->cgroup_realized_mask == target_mask;
}
/* Check if necessary controllers and attributes for a unit are in place.
@ -704,7 +863,7 @@ static bool unit_has_mask_realized(Unit *u, CGroupControllerMask mask) {
*
* Returns 0 on success and < 0 on failure. */
static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
CGroupControllerMask mask;
CGroupMask target_mask, enable_mask;
int r;
assert(u);
@ -714,9 +873,8 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
u->in_cgroup_queue = false;
}
mask = unit_get_target_mask(u);
if (unit_has_mask_realized(u, mask))
target_mask = unit_get_target_mask(u);
if (unit_has_mask_realized(u, target_mask))
return 0;
/* First, realize parents */
@ -727,12 +885,13 @@ static int unit_realize_cgroup_now(Unit *u, ManagerState state) {
}
/* And then do the real work */
r = unit_create_cgroups(u, mask);
enable_mask = unit_get_enable_mask(u);
r = unit_create_cgroup(u, target_mask, enable_mask);
if (r < 0)
return r;
/* Finally, apply the necessary attributes. */
cgroup_context_apply(unit_get_cgroup_context(u), mask, u->cgroup_path, state);
cgroup_context_apply(unit_get_cgroup_context(u), target_mask, u->cgroup_path, state);
return 0;
}
@ -759,7 +918,7 @@ unsigned manager_dispatch_cgroup_queue(Manager *m) {
r = unit_realize_cgroup_now(i, state);
if (r < 0)
log_warning_errno(r, "Failed to realize cgroups for queued unit %s: %m", i->id);
log_warning_errno(r, "Failed to realize cgroups for queued unit %s, ignoring: %m", i->id);
n++;
}
@ -829,39 +988,67 @@ int unit_realize_cgroup(Unit *u) {
return unit_realize_cgroup_now(u, manager_state(u->manager));
}
void unit_destroy_cgroup_if_empty(Unit *u) {
void unit_release_cgroup(Unit *u) {
assert(u);
/* Forgets all cgroup details for this cgroup */
if (u->cgroup_path) {
(void) hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
u->cgroup_path = mfree(u->cgroup_path);
}
if (u->cgroup_inotify_wd >= 0) {
if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_inotify_wd) < 0)
log_unit_debug_errno(u, errno, "Failed to remove cgroup inotify watch %i for %s, ignoring", u->cgroup_inotify_wd, u->id);
(void) hashmap_remove(u->manager->cgroup_inotify_wd_unit, INT_TO_PTR(u->cgroup_inotify_wd));
u->cgroup_inotify_wd = -1;
}
}
void unit_prune_cgroup(Unit *u) {
int r;
bool is_root_slice;
assert(u);
/* Removes the cgroup, if empty and possible, and stops watching it. */
if (!u->cgroup_path)
return;
r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !unit_has_name(u, SPECIAL_ROOT_SLICE));
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
if (r < 0) {
log_debug_errno(r, "Failed to destroy cgroup %s: %m", u->cgroup_path);
log_debug_errno(r, "Failed to destroy cgroup %s, ignoring: %m", u->cgroup_path);
return;
}
hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
if (is_root_slice)
return;
unit_release_cgroup(u);
free(u->cgroup_path);
u->cgroup_path = NULL;
u->cgroup_realized = false;
u->cgroup_realized_mask = 0;
}
pid_t unit_search_main_pid(Unit *u) {
int unit_search_main_pid(Unit *u, pid_t *ret) {
_cleanup_fclose_ FILE *f = NULL;
pid_t pid = 0, npid, mypid;
int r;
assert(u);
assert(ret);
if (!u->cgroup_path)
return 0;
return -ENXIO;
if (cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f) < 0)
return 0;
r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, &f);
if (r < 0)
return r;
mypid = getpid();
while (cg_read_pid(f, &npid) > 0) {
@ -874,90 +1061,274 @@ pid_t unit_search_main_pid(Unit *u) {
if (get_parent_of_pid(npid, &ppid) >= 0 && ppid != mypid)
continue;
if (pid != 0) {
if (pid != 0)
/* Dang, there's more than one daemonized PID
in this group, so we don't know what process
is the main process. */
pid = 0;
break;
}
return -ENODATA;
pid = npid;
}
return pid;
*ret = pid;
return 0;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
_cleanup_closedir_ DIR *d = NULL;
_cleanup_fclose_ FILE *f = NULL;
int ret = 0, r;
assert(u);
assert(path);
r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
if (r < 0)
ret = r;
else {
pid_t pid;
while ((r = cg_read_pid(f, &pid)) > 0) {
r = unit_watch_pid(u, pid);
if (r < 0 && ret >= 0)
ret = r;
}
if (r < 0 && ret >= 0)
ret = r;
}
r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
if (r < 0) {
if (ret >= 0)
ret = r;
} else {
char *fn;
while ((r = cg_read_subgroup(d, &fn)) > 0) {
_cleanup_free_ char *p = NULL;
p = strjoin(path, "/", fn, NULL);
free(fn);
if (!p)
return -ENOMEM;
r = unit_watch_pids_in_path(u, p);
if (r < 0 && ret >= 0)
ret = r;
}
if (r < 0 && ret >= 0)
ret = r;
}
return ret;
}
int unit_watch_all_pids(Unit *u) {
assert(u);
/* Adds all PIDs from our cgroup to the set of PIDs we
* watch. This is a fallback logic for cases where we do not
* get reliable cgroup empty notifications: we try to use
* SIGCHLD as replacement. */
if (!u->cgroup_path)
return -ENOENT;
if (cg_unified() > 0) /* On unified we can use proper notifications */
return 0;
return unit_watch_pids_in_path(u, u->cgroup_path);
}
int unit_notify_cgroup_empty(Unit *u) {
int r;
assert(u);
if (!u->cgroup_path)
return 0;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r <= 0)
return r;
unit_add_to_gc_queue(u);
if (UNIT_VTABLE(u)->notify_cgroup_empty)
UNIT_VTABLE(u)->notify_cgroup_empty(u);
return 0;
}
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
Manager *m = userdata;
assert(s);
assert(fd >= 0);
assert(m);
for (;;) {
union inotify_event_buffer buffer;
struct inotify_event *e;
ssize_t l;
l = read(fd, &buffer, sizeof(buffer));
if (l < 0) {
if (errno == EINTR || errno == EAGAIN)
return 0;
return log_error_errno(errno, "Failed to read control group inotify events: %m");
}
FOREACH_INOTIFY_EVENT(e, buffer, l) {
Unit *u;
if (e->wd < 0)
/* Queue overflow has no watch descriptor */
continue;
if (e->mask & IN_IGNORED)
/* The watch was just removed */
continue;
u = hashmap_get(m->cgroup_inotify_wd_unit, INT_TO_PTR(e->wd));
if (!u) /* Not that inotify might deliver
* events for a watch even after it
* was removed, because it was queued
* before the removal. Let's ignore
* this here safely. */
continue;
(void) unit_notify_cgroup_empty(u);
}
}
}
int manager_setup_cgroup(Manager *m) {
_cleanup_free_ char *path = NULL;
int r;
CGroupController c;
int r, unified;
char *e;
assert(m);
/* 1. Determine hierarchy */
free(m->cgroup_root);
m->cgroup_root = NULL;
m->cgroup_root = mfree(m->cgroup_root);
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &m->cgroup_root);
if (r < 0)
return log_error_errno(r, "Cannot determine cgroup we are running in: %m");
/* LEGACY: Already in /system.slice? If so, let's cut this
* off. This is to support live upgrades from older systemd
* versions where PID 1 was moved there. */
if (m->running_as == MANAGER_SYSTEM) {
char *e;
/* Chop off the init scope, if we are already located in it */
e = endswith(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
/* LEGACY: Also chop off the system slice if we are in
* it. This is to support live upgrades from older systemd
* versions where PID 1 was moved there. Also see
* cg_get_root_path(). */
if (!e && m->running_as == MANAGER_SYSTEM) {
e = endswith(m->cgroup_root, "/" SPECIAL_SYSTEM_SLICE);
if (!e)
e = endswith(m->cgroup_root, "/system");
if (e)
*e = 0;
e = endswith(m->cgroup_root, "/system"); /* even more legacy */
}
if (e)
*e = 0;
/* And make sure to store away the root value without trailing
* slash, even for the root dir, so that we can easily prepend
* it everywhere. */
if (streq(m->cgroup_root, "/"))
m->cgroup_root[0] = 0;
while ((e = endswith(m->cgroup_root, "/")))
*e = 0;
/* 2. Show data */
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, NULL, &path);
if (r < 0)
return log_error_errno(r, "Cannot find cgroup mount point: %m");
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
unified = cg_unified();
if (unified < 0)
return log_error_errno(r, "Couldn't determine if we are running in the unified hierarchy: %m");
if (unified > 0)
log_debug("Unified cgroup hierarchy is located at %s.", path);
else
log_debug("Using cgroup controller " SYSTEMD_CGROUP_CONTROLLER ". File system hierarchy is at %s.", path);
if (!m->test_run) {
const char *scope_path;
/* 3. Install agent */
if (m->running_as == MANAGER_SYSTEM) {
if (unified) {
/* In the unified hierarchy we can can get
* cgroup empty notifications via inotify. */
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
safe_close(m->cgroup_inotify_fd);
m->cgroup_inotify_fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
if (m->cgroup_inotify_fd < 0)
return log_error_errno(errno, "Failed to create control group inotify object: %m");
r = sd_event_add_io(m->event, &m->cgroup_inotify_event_source, m->cgroup_inotify_fd, EPOLLIN, on_cgroup_inotify_event, m);
if (r < 0)
return log_error_errno(r, "Failed to watch control group inotify object: %m");
r = sd_event_source_set_priority(m->cgroup_inotify_event_source, SD_EVENT_PRIORITY_IDLE - 5);
if (r < 0)
return log_error_errno(r, "Failed to set priority of inotify event source: %m");
(void) sd_event_source_set_description(m->cgroup_inotify_event_source, "cgroup-inotify");
} else if (m->running_as == MANAGER_SYSTEM) {
/* On the legacy hierarchy we only get
* notifications via cgroup agents. (Which
* isn't really reliable, since it does not
* generate events when control groups with
* children run empty. */
r = cg_install_release_agent(SYSTEMD_CGROUP_CONTROLLER, SYSTEMD_CGROUP_AGENT_PATH);
if (r < 0)
log_warning_errno(r, "Failed to install release agent, ignoring: %m");
else if (r > 0)
log_debug("Installed release agent.");
else
else if (r == 0)
log_debug("Release agent already installed.");
}
/* 4. Make sure we are in the root cgroup */
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, 0);
/* 4. Make sure we are in the special "init.scope" unit in the root slice. */
scope_path = strjoina(m->cgroup_root, "/" SPECIAL_INIT_SCOPE);
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, scope_path, 0);
if (r < 0)
return log_error_errno(r, "Failed to create root cgroup hierarchy: %m");
return log_error_errno(r, "Failed to create %s control group: %m", scope_path);
/* also, move all other userspace processes remaining
* in the root cgroup into that scope. */
r = cg_migrate(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, SYSTEMD_CGROUP_CONTROLLER, scope_path, false);
if (r < 0)
log_warning_errno(r, "Couldn't move remaining userspace processes, ignoring: %m");
/* 5. And pin it, so that it cannot be unmounted */
safe_close(m->pin_cgroupfs_fd);
m->pin_cgroupfs_fd = open(path, O_RDONLY|O_CLOEXEC|O_DIRECTORY|O_NOCTTY|O_NONBLOCK);
if (m->pin_cgroupfs_fd < 0)
return log_error_errno(errno, "Failed to open pin file: %m");
/* 6. Always enable hierarchical support if it exists... */
cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
if (!unified)
(void) cg_set_attribute("memory", "/", "memory.use_hierarchy", "1");
}
/* 7. Figure out which controllers are supported */
m->cgroup_supported = cg_mask_supported();
r = cg_mask_supported(&m->cgroup_supported);
if (r < 0)
return log_error_errno(r, "Failed to determine supported controllers: %m");
for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++)
log_debug("Controller '%s' supported: %s", cgroup_controller_to_string(c), yes_no(m->cgroup_supported & c));
return 0;
}
@ -968,12 +1339,16 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
/* We can't really delete the group, since we are in it. But
* let's trim it. */
if (delete && m->cgroup_root)
cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
(void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, m->cgroup_root, false);
m->cgroup_inotify_wd_unit = hashmap_free(m->cgroup_inotify_wd_unit);
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
m->pin_cgroupfs_fd = safe_close(m->pin_cgroupfs_fd);
free(m->cgroup_root);
m->cgroup_root = NULL;
m->cgroup_root = mfree(m->cgroup_root);
}
Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
@ -992,8 +1367,8 @@ Unit* manager_get_unit_by_cgroup(Manager *m, const char *cgroup) {
char *e;
e = strrchr(p, '/');
if (e == p || !e)
return NULL;
if (!e || e == p)
return hashmap_get(m->cgroup_unit, SPECIAL_ROOT_SLICE);
*e = 0;
@ -1010,9 +1385,12 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
assert(m);
if (pid <= 1)
if (pid <= 0)
return NULL;
if (pid == 1)
return hashmap_get(m->units, SPECIAL_INIT_SCOPE);
u = hashmap_get(m->watch_pids1, LONG_TO_PTR(pid));
if (u)
return u;
@ -1030,7 +1408,6 @@ Unit *manager_get_unit_by_pid(Manager *m, pid_t pid) {
int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
Unit *u;
int r;
assert(m);
assert(cgroup);
@ -1039,15 +1416,7 @@ int manager_notify_cgroup_empty(Manager *m, const char *cgroup) {
if (!u)
return 0;
r = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path);
if (r <= 0)
return r;
if (UNIT_VTABLE(u)->notify_cgroup_empty)
UNIT_VTABLE(u)->notify_cgroup_empty(u);
unit_add_to_gc_queue(u);
return 0;
return unit_notify_cgroup_empty(u);
}
int unit_get_memory_current(Unit *u, uint64_t *ret) {
@ -1060,10 +1429,13 @@ int unit_get_memory_current(Unit *u, uint64_t *ret) {
if (!u->cgroup_path)
return -ENODATA;
if ((u->cgroup_realized_mask & CGROUP_MEMORY) == 0)
if ((u->cgroup_realized_mask & CGROUP_MASK_MEMORY) == 0)
return -ENODATA;
r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
if (cg_unified() <= 0)
r = cg_get_attribute("memory", u->cgroup_path, "memory.usage_in_bytes", &v);
else
r = cg_get_attribute("memory", u->cgroup_path, "memory.current", &v);
if (r == -ENOENT)
return -ENODATA;
if (r < 0)
@ -1083,7 +1455,7 @@ static int unit_get_cpu_usage_raw(Unit *u, nsec_t *ret) {
if (!u->cgroup_path)
return -ENODATA;
if ((u->cgroup_realized_mask & CGROUP_CPUACCT) == 0)
if ((u->cgroup_realized_mask & CGROUP_MASK_CPUACCT) == 0)
return -ENODATA;
r = cg_get_attribute("cpuacct", u->cgroup_path, "cpuacct.usage", &v);

View File

@ -96,22 +96,32 @@ struct CGroupContext {
void cgroup_context_init(CGroupContext *c);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(CGroupContext *c, FILE* f, const char *prefix);
void cgroup_context_apply(CGroupContext *c, CGroupControllerMask mask, const char *path, ManagerState state);
void cgroup_context_apply(CGroupContext *c, CGroupMask mask, const char *path, ManagerState state);
CGroupControllerMask cgroup_context_get_mask(CGroupContext *c);
CGroupMask cgroup_context_get_mask(CGroupContext *c);
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a);
void cgroup_context_free_blockio_device_weight(CGroupContext *c, CGroupBlockIODeviceWeight *w);
void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
CGroupControllerMask unit_get_cgroup_mask(Unit *u);
CGroupControllerMask unit_get_siblings_mask(Unit *u);
CGroupControllerMask unit_get_members_mask(Unit *u);
CGroupControllerMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_own_mask(Unit *u);
CGroupMask unit_get_siblings_mask(Unit *u);
CGroupMask unit_get_members_mask(Unit *u);
CGroupMask unit_get_subtree_mask(Unit *u);
CGroupMask unit_get_target_mask(Unit *u);
CGroupMask unit_get_enable_mask(Unit *u);
void unit_update_cgroup_members_masks(Unit *u);
char *unit_default_cgroup_path(Unit *u);
int unit_set_cgroup_path(Unit *u, const char *path);
int unit_realize_cgroup(Unit *u);
void unit_destroy_cgroup_if_empty(Unit *u);
void unit_release_cgroup(Unit *u);
void unit_prune_cgroup(Unit *u);
int unit_watch_cgroup(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u);
int manager_setup_cgroup(Manager *m);
@ -122,9 +132,8 @@ unsigned manager_dispatch_cgroup_queue(Manager *m);
Unit *manager_get_unit_by_cgroup(Manager *m, const char *cgroup);
Unit* manager_get_unit_by_pid(Manager *m, pid_t pid);
pid_t unit_search_main_pid(Unit *u);
int manager_notify_cgroup_empty(Manager *m, const char *group);
int unit_search_main_pid(Unit *u, pid_t *ret);
int unit_watch_all_pids(Unit *u);
int unit_get_memory_current(Unit *u, uint64_t *ret);
int unit_get_cpu_usage(Unit *u, nsec_t *ret);
@ -132,5 +141,8 @@ int unit_reset_cpu_usage(Unit *u);
bool unit_cgroup_delegate(Unit *u);
int unit_notify_cgroup_empty(Unit *u);
int manager_notify_cgroup_empty(Manager *m, const char *group);
const char* cgroup_device_policy_to_string(CGroupDevicePolicy i) _const_;
CGroupDevicePolicy cgroup_device_policy_from_string(const char *s) _pure_;

View File

@ -228,7 +228,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->cpu_accounting = b;
u->cgroup_realized_mask &= ~CGROUP_CPUACCT;
u->cgroup_realized_mask &= ~CGROUP_MASK_CPUACCT;
unit_write_drop_in_private(u, mode, name, b ? "CPUAccounting=yes" : "CPUAccounting=no");
}
@ -252,7 +252,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->cpu_shares = ul;
u->cgroup_realized_mask &= ~CGROUP_CPU;
u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
unit_write_drop_in_private_format(u, mode, name, "CPUShares=%lu", ul);
}
@ -276,7 +276,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->startup_cpu_shares = ul;
u->cgroup_realized_mask &= ~CGROUP_CPU;
u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
unit_write_drop_in_private_format(u, mode, name, "StartupCPUShares=%lu", ul);
}
@ -294,7 +294,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->cpu_quota_per_sec_usec = u64;
u->cgroup_realized_mask &= ~CGROUP_CPU;
u->cgroup_realized_mask &= ~CGROUP_MASK_CPU;
unit_write_drop_in_private_format(u, mode, "CPUQuota", "CPUQuota=%0.f%%", (double) (c->cpu_quota_per_sec_usec / 10000));
}
@ -309,7 +309,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->blockio_accounting = b;
u->cgroup_realized_mask &= ~CGROUP_BLKIO;
u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
unit_write_drop_in_private(u, mode, name, b ? "BlockIOAccounting=yes" : "BlockIOAccounting=no");
}
@ -333,7 +333,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->blockio_weight = ul;
u->cgroup_realized_mask &= ~CGROUP_BLKIO;
u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
unit_write_drop_in_private_format(u, mode, name, "BlockIOWeight=%lu", ul);
}
@ -357,7 +357,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->startup_blockio_weight = ul;
u->cgroup_realized_mask &= ~CGROUP_BLKIO;
u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
unit_write_drop_in_private_format(u, mode, name, "StartupBlockIOWeight=%lu", ul);
}
@ -427,7 +427,7 @@ int bus_cgroup_set_property(
cgroup_context_free_blockio_device_bandwidth(c, a);
}
u->cgroup_realized_mask &= ~CGROUP_BLKIO;
u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
f = open_memstream(&buf, &size);
if (!f)
@ -510,7 +510,7 @@ int bus_cgroup_set_property(
cgroup_context_free_blockio_device_weight(c, c->blockio_device_weights);
}
u->cgroup_realized_mask &= ~CGROUP_BLKIO;
u->cgroup_realized_mask &= ~CGROUP_MASK_BLKIO;
f = open_memstream(&buf, &size);
if (!f)
@ -535,7 +535,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->memory_accounting = b;
u->cgroup_realized_mask &= ~CGROUP_MEMORY;
u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY;
unit_write_drop_in_private(u, mode, name, b ? "MemoryAccounting=yes" : "MemoryAccounting=no");
}
@ -550,7 +550,7 @@ int bus_cgroup_set_property(
if (mode != UNIT_CHECK) {
c->memory_limit = limit;
u->cgroup_realized_mask &= ~CGROUP_MEMORY;
u->cgroup_realized_mask &= ~CGROUP_MASK_MEMORY;
unit_write_drop_in_private_format(u, mode, name, "%s=%" PRIu64, name, limit);
}
@ -572,7 +572,7 @@ int bus_cgroup_set_property(
char *buf;
c->device_policy = p;
u->cgroup_realized_mask &= ~CGROUP_DEVICE;
u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE;
buf = strjoina("DevicePolicy=", policy);
unit_write_drop_in_private(u, mode, name, buf);
@ -651,7 +651,7 @@ int bus_cgroup_set_property(
cgroup_context_free_device_allow(c, c->device_allow);
}
u->cgroup_realized_mask &= ~CGROUP_DEVICE;
u->cgroup_realized_mask &= ~CGROUP_MASK_DEVICE;
f = open_memstream(&buf, &size);
if (!f)

View File

@ -25,6 +25,7 @@
#include "cgroup-util.h"
#include "strv.h"
#include "bus-common-errors.h"
#include "special.h"
#include "dbus.h"
#include "dbus-unit.h"
@ -973,6 +974,8 @@ static int bus_unit_set_transient_property(
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "The slice property is only available for units with control groups.");
if (u->type == UNIT_SLICE)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Slice may not be set for slice units.");
if (unit_has_name(u, SPECIAL_INIT_SCOPE))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set slice for init.scope");
r = sd_bus_message_read(message, "s", &s);
if (r < 0)

View File

@ -214,7 +214,7 @@ struct ExecParameters {
bool apply_tty_stdin;
bool confirm_spawn;
bool selinux_context_net;
CGroupControllerMask cgroup_supported;
CGroupMask cgroup_supported;
const char *cgroup_path;
bool cgroup_delegate;
const char *runtime_prefix;

View File

@ -568,7 +568,9 @@ int manager_new(ManagerRunningAs running_as, bool test_run, Manager **_m) {
m->idle_pipe[0] = m->idle_pipe[1] = m->idle_pipe[2] = m->idle_pipe[3] = -1;
m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd = m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd = -1;
m->pin_cgroupfs_fd = m->notify_fd = m->signal_fd = m->time_change_fd =
m->dev_autofs_fd = m->private_listen_fd = m->kdbus_fd = m->utab_inotify_fd =
m->cgroup_inotify_fd = -1;
m->current_job_id = 1; /* start as id #1, so that we can leave #0 around as "null-like" value */
m->ask_password_inotify_fd = -1;
@ -2722,7 +2724,7 @@ void manager_check_finished(Manager *m) {
SET_FOREACH(u, m->startup_units, i)
if (u->cgroup_path)
cgroup_context_apply(unit_get_cgroup_context(u), unit_get_cgroup_mask(u), u->cgroup_path, manager_state(m));
cgroup_context_apply(unit_get_cgroup_context(u), unit_get_own_mask(u), u->cgroup_path, manager_state(m));
}
static int create_generator_dir(Manager *m, char **generator, const char *name) {

View File

@ -215,16 +215,22 @@ struct Manager {
/* Data specific to the cgroup subsystem */
Hashmap *cgroup_unit;
CGroupControllerMask cgroup_supported;
CGroupMask cgroup_supported;
char *cgroup_root;
int gc_marker;
unsigned n_in_gc_queue;
/* Notifications from cgroups, when the unified hierarchy is
* used is done via inotify. */
int cgroup_inotify_fd;
sd_event_source *cgroup_inotify_event_source;
Hashmap *cgroup_inotify_wd_unit;
/* Make sure the user cannot accidentally unmount our cgroup
* file system */
int pin_cgroupfs_fd;
int gc_marker;
unsigned n_in_gc_queue;
/* Flags */
ManagerRunningAs running_as;
ManagerExitCode exit_code:5;

View File

@ -93,12 +93,14 @@ static const MountPoint mount_table[] = {
#endif
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup", "cgroup", "__DEVEL__sane_behavior", MS_NOSUID|MS_NOEXEC|MS_NODEV,
cg_is_unified_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd,xattr", MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_IN_CONTAINER },
cg_is_legacy_wanted, MNT_IN_CONTAINER },
{ "cgroup", "/sys/fs/cgroup/systemd", "cgroup", "none,name=systemd", MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_FATAL|MNT_IN_CONTAINER },
cg_is_legacy_wanted, MNT_FATAL|MNT_IN_CONTAINER },
{ "pstore", "/sys/fs/pstore", "pstore", NULL, MS_NOSUID|MS_NOEXEC|MS_NODEV,
NULL, MNT_NONE },
#ifdef ENABLE_EFI
@ -217,6 +219,9 @@ int mount_cgroup_controllers(char ***join_controllers) {
_cleanup_set_free_free_ Set *controllers = NULL;
int r;
if (!cg_is_legacy_wanted())
return 0;
/* Mount all available cgroup controllers that are built into the kernel. */
controllers = set_new(&string_hash_ops);

View File

@ -22,12 +22,13 @@
#include <errno.h>
#include <unistd.h>
#include "unit.h"
#include "scope.h"
#include "log.h"
#include "dbus-scope.h"
#include "strv.h"
#include "special.h"
#include "unit-name.h"
#include "unit.h"
#include "scope.h"
#include "dbus-scope.h"
#include "load-dropin.h"
static const UnitActiveState state_translation_table[_SCOPE_STATE_MAX] = {
@ -136,7 +137,9 @@ static int scope_verify(Scope *s) {
if (UNIT(s)->load_state != UNIT_LOADED)
return 0;
if (set_isempty(UNIT(s)->pids) && UNIT(s)->manager->n_reloading <= 0) {
if (set_isempty(UNIT(s)->pids) &&
!manager_is_reloading_or_reexecuting(UNIT(s)->manager) <= 0 &&
!unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE)) {
log_unit_error(UNIT(s), "Scope has no PIDs. Refusing.");
return -EINVAL;
}
@ -151,7 +154,7 @@ static int scope_load(Unit *u) {
assert(s);
assert(u->load_state == UNIT_STUB);
if (!u->transient && UNIT(s)->manager->n_reloading <= 0)
if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager))
return -ENOENT;
u->load_state = UNIT_LOADED;
@ -279,6 +282,9 @@ static int scope_start(Unit *u) {
assert(s);
if (unit_has_name(u, SPECIAL_INIT_SCOPE))
return -EPERM;
if (s->state == SCOPE_FAILED)
return -EPERM;
@ -289,7 +295,7 @@ static int scope_start(Unit *u) {
assert(s->state == SCOPE_DEAD);
if (!u->transient && UNIT(s)->manager->n_reloading <= 0)
if (!u->transient && !manager_is_reloading_or_reexecuting(u->manager))
return -ENOENT;
(void) unit_realize_cgroup(u);
@ -464,6 +470,9 @@ static int scope_dispatch_timer(sd_event_source *source, usec_t usec, void *user
int scope_abandon(Scope *s) {
assert(s);
if (unit_has_name(UNIT(s), SPECIAL_INIT_SCOPE))
return -EPERM;
if (!IN_SET(s->state, SCOPE_RUNNING, SCOPE_ABANDONED))
return -ESTALE;
@ -499,6 +508,48 @@ _pure_ static const char *scope_sub_state_to_string(Unit *u) {
return scope_state_to_string(SCOPE(u)->state);
}
static int scope_enumerate(Manager *m) {
Unit *u;
int r;
assert(m);
/* Let's unconditionally add the "init.scope" special unit
* that encapsulates PID 1. Note that PID 1 already is in the
* cgroup for this, we hence just need to allocate the object
* for it and that's it. */
u = manager_get_unit(m, SPECIAL_INIT_SCOPE);
if (!u) {
u = unit_new(m, sizeof(Scope));
if (!u)
return log_oom();
r = unit_add_name(u, SPECIAL_INIT_SCOPE);
if (r < 0) {
unit_free(u);
return log_error_errno(r, "Failed to add init.scope name");
}
}
u->transient = true;
u->default_dependencies = false;
u->no_gc = true;
SCOPE(u)->deserialized_state = SCOPE_RUNNING;
SCOPE(u)->kill_context.kill_signal = SIGRTMIN+14;
/* Prettify things, if we can. */
if (!u->description)
u->description = strdup("System and Service Manager");
if (!u->documentation)
(void) strv_extend(&u->documentation, "man:systemd(1)");
unit_add_to_load_queue(u);
unit_add_to_dbus_queue(u);
return 0;
}
static const char* const scope_state_table[_SCOPE_STATE_MAX] = {
[SCOPE_DEAD] = "dead",
[SCOPE_RUNNING] = "running",
@ -565,5 +616,7 @@ const UnitVTable scope_vtable = {
.bus_set_property = bus_scope_set_property,
.bus_commit_properties = bus_scope_commit_properties,
.can_transient = true
.can_transient = true,
.enumerate = scope_enumerate,
};

View File

@ -767,7 +767,7 @@ static int service_load_pid_file(Service *s, bool may_warn) {
}
static int service_search_main_pid(Service *s) {
pid_t pid;
pid_t pid = 0;
int r;
assert(s);
@ -782,9 +782,9 @@ static int service_search_main_pid(Service *s) {
assert(s->main_pid <= 0);
pid = unit_search_main_pid(UNIT(s));
if (pid <= 0)
return -ENOENT;
r = unit_search_main_pid(UNIT(s), &pid);
if (r < 0)
return r;
log_unit_debug(UNIT(s), "Main PID guessed: "PID_FMT, pid);
r = service_set_main_pid(s, pid);
@ -860,7 +860,7 @@ static void service_set_state(Service *s, ServiceState state) {
/* For the inactive states unit_notify() will trim the cgroup,
* but for exit we have to do that ourselves... */
if (state == SERVICE_EXITED && UNIT(s)->manager->n_reloading <= 0)
unit_destroy_cgroup_if_empty(UNIT(s));
unit_prune_cgroup(UNIT(s));
/* For remain_after_exit services, let's see if we can "release" the
* hold on the console, since unit_notify() only does that in case of
@ -2644,7 +2644,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
break;
}
} else
service_search_main_pid(s);
(void) service_search_main_pid(s);
service_enter_start_post(s);
break;
@ -2666,7 +2666,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
break;
}
} else
service_search_main_pid(s);
(void) service_search_main_pid(s);
service_enter_running(s, SERVICE_SUCCESS);
break;
@ -2674,7 +2674,7 @@ static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
case SERVICE_RELOAD:
if (f == SERVICE_SUCCESS) {
service_load_pid_file(s, true);
service_search_main_pid(s);
(void) service_search_main_pid(s);
}
s->reload_result = f;

View File

@ -21,12 +21,13 @@
#include <errno.h>
#include "unit.h"
#include "slice.h"
#include "log.h"
#include "dbus-slice.h"
#include "strv.h"
#include "special.h"
#include "unit-name.h"
#include "unit.h"
#include "slice.h"
#include "dbus-slice.h"
static const UnitActiveState state_translation_table[_SLICE_STATE_MAX] = {
[SLICE_DEAD] = UNIT_INACTIVE,
@ -252,6 +253,40 @@ _pure_ static const char *slice_sub_state_to_string(Unit *u) {
return slice_state_to_string(SLICE(u)->state);
}
static int slice_enumerate(Manager *m) {
Unit *u;
int r;
assert(m);
u = manager_get_unit(m, SPECIAL_ROOT_SLICE);
if (!u) {
u = unit_new(m, sizeof(Slice));
if (!u)
return log_oom();
r = unit_add_name(u, SPECIAL_ROOT_SLICE);
if (r < 0) {
unit_free(u);
return log_error_errno(r, "Failed to add -.slice name");
}
}
u->default_dependencies = false;
u->no_gc = true;
SLICE(u)->deserialized_state = SLICE_ACTIVE;
if (!u->description)
u->description = strdup("Root Slice");
if (!u->documentation)
(void) strv_extend(&u->documentation, "man:systemd.special(7)");
unit_add_to_load_queue(u);
unit_add_to_dbus_queue(u);
return 0;
}
static const char* const slice_state_table[_SLICE_STATE_MAX] = {
[SLICE_DEAD] = "dead",
[SLICE_ACTIVE] = "active"
@ -293,6 +328,8 @@ const UnitVTable slice_vtable = {
.bus_set_property = bus_slice_set_property,
.bus_commit_properties = bus_slice_commit_properties,
.enumerate = slice_enumerate,
.status_message_formats = {
.finished_start_job = {
[JOB_DONE] = "Created slice %s.",

View File

@ -91,6 +91,7 @@ Unit *unit_new(Manager *m, size_t size) {
u->unit_file_state = _UNIT_FILE_STATE_INVALID;
u->unit_file_preset = -1;
u->on_failure_job_mode = JOB_REPLACE;
u->cgroup_inotify_wd = -1;
RATELIMIT_INIT(u->auto_stop_ratelimit, 10 * USEC_PER_SEC, 16);
@ -525,10 +526,7 @@ void unit_free(Unit *u) {
if (u->in_cgroup_queue)
LIST_REMOVE(cgroup_queue, u->manager->cgroup_queue, u);
if (u->cgroup_path) {
hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
free(u->cgroup_path);
}
unit_release_cgroup(u);
manager_update_failed_units(u->manager, u, false);
set_remove(u->manager->startup_units, u);
@ -1801,7 +1799,7 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
/* Make sure the cgroup is always removed when we become inactive */
if (UNIT_IS_INACTIVE_OR_FAILED(ns))
unit_destroy_cgroup_if_empty(u);
unit_prune_cgroup(u);
/* Note that this doesn't apply to RemainAfterExit services exiting
* successfully, since there's no change of state in that case. Which is
@ -2028,70 +2026,7 @@ void unit_unwatch_all_pids(Unit *u) {
while (!set_isempty(u->pids))
unit_unwatch_pid(u, PTR_TO_LONG(set_first(u->pids)));
set_free(u->pids);
u->pids = NULL;
}
static int unit_watch_pids_in_path(Unit *u, const char *path) {
_cleanup_closedir_ DIR *d = NULL;
_cleanup_fclose_ FILE *f = NULL;
int ret = 0, r;
assert(u);
assert(path);
/* Adds all PIDs from a specific cgroup path to the set of PIDs we watch. */
r = cg_enumerate_processes(SYSTEMD_CGROUP_CONTROLLER, path, &f);
if (r >= 0) {
pid_t pid;
while ((r = cg_read_pid(f, &pid)) > 0) {
r = unit_watch_pid(u, pid);
if (r < 0 && ret >= 0)
ret = r;
}
if (r < 0 && ret >= 0)
ret = r;
} else if (ret >= 0)
ret = r;
r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
if (r >= 0) {
char *fn;
while ((r = cg_read_subgroup(d, &fn)) > 0) {
_cleanup_free_ char *p = NULL;
p = strjoin(path, "/", fn, NULL);
free(fn);
if (!p)
return -ENOMEM;
r = unit_watch_pids_in_path(u, p);
if (r < 0 && ret >= 0)
ret = r;
}
if (r < 0 && ret >= 0)
ret = r;
} else if (ret >= 0)
ret = r;
return ret;
}
int unit_watch_all_pids(Unit *u) {
assert(u);
/* Adds all PIDs from our cgroup to the set of PIDs we watch */
if (!u->cgroup_path)
return -ENOENT;
return unit_watch_pids_in_path(u, u->cgroup_path);
u->pids = set_free(u->pids);
}
void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2) {
@ -2400,31 +2335,6 @@ char *unit_dbus_path(Unit *u) {
return unit_dbus_path_from_name(u->id);
}
char *unit_default_cgroup_path(Unit *u) {
_cleanup_free_ char *escaped = NULL, *slice = NULL;
int r;
assert(u);
if (unit_has_name(u, SPECIAL_ROOT_SLICE))
return strdup(u->manager->cgroup_root);
if (UNIT_ISSET(u->slice) && !unit_has_name(UNIT_DEREF(u->slice), SPECIAL_ROOT_SLICE)) {
r = cg_slice_to_path(UNIT_DEREF(u->slice)->id, &slice);
if (r < 0)
return NULL;
}
escaped = cg_escape(u->id);
if (!escaped)
return NULL;
if (slice)
return strjoin(u->manager->cgroup_root, "/", slice, "/", escaped, NULL);
else
return strjoin(u->manager->cgroup_root, "/", escaped, NULL);
}
int unit_set_slice(Unit *u, Unit *slice) {
assert(u);
assert(slice);
@ -2447,6 +2357,10 @@ int unit_set_slice(Unit *u, Unit *slice) {
if (slice->type != UNIT_SLICE)
return -EINVAL;
if (unit_has_name(u, SPECIAL_INIT_SCOPE) &&
!unit_has_name(slice, SPECIAL_ROOT_SLICE))
return -EPERM;
if (UNIT_DEREF(u->slice) == slice)
return 0;
@ -2495,7 +2409,7 @@ int unit_set_default_slice(Unit *u) {
slice_name = b;
} else
slice_name =
u->manager->running_as == MANAGER_SYSTEM
u->manager->running_as == MANAGER_SYSTEM && !unit_has_name(u, SPECIAL_INIT_SCOPE)
? SPECIAL_SYSTEM_SLICE
: SPECIAL_ROOT_SLICE;
@ -2704,40 +2618,6 @@ void unit_serialize_item(Unit *u, FILE *f, const char *key, const char *value) {
fprintf(f, "%s=%s\n", key, value);
}
static int unit_set_cgroup_path(Unit *u, const char *path) {
_cleanup_free_ char *p = NULL;
int r;
assert(u);
if (path) {
p = strdup(path);
if (!p)
return -ENOMEM;
} else
p = NULL;
if (streq_ptr(u->cgroup_path, p))
return 0;
if (p) {
r = hashmap_put(u->manager->cgroup_unit, p, u);
if (r < 0)
return r;
}
if (u->cgroup_path) {
log_unit_debug(u, "Changing cgroup path from %s to %s.", u->cgroup_path, strna(p));
hashmap_remove(u->manager->cgroup_unit, u->cgroup_path);
free(u->cgroup_path);
}
u->cgroup_path = p;
p = NULL;
return 0;
}
int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
ExecRuntime **rt = NULL;
size_t offset;
@ -2868,6 +2748,8 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
if (r < 0)
log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
(void) unit_watch_cgroup(u);
continue;
} else if (streq(l, "cgroup-realized")) {
int b;
@ -3600,18 +3482,22 @@ int unit_kill_context(
} else if (r > 0) {
/* FIXME: For now, we will not wait for the
* cgroup members to die if we are running in
* a container or if this is a delegation
* unit, simply because cgroup notification is
* unreliable in these cases. It doesn't work
* at all in containers, and outside of
* containers it can be confused easily by
* left-over directories in the cgroup --
* which however should not exist in
* non-delegated units. */
/* FIXME: For now, on the legacy hierarchy, we
* will not wait for the cgroup members to die
* if we are running in a container or if this
* is a delegation unit, simply because cgroup
* notification is unreliable in these
* cases. It doesn't work at all in
* containers, and outside of containers it
* can be confused easily by left-over
* directories in the cgroup -- which however
* should not exist in non-delegated units. On
* the unified hierarchy that's different,
* there we get proper events. Hence rely on
* them.*/
if (detect_container(NULL) == 0 && !unit_cgroup_delegate(u))
if (cg_unified() > 0 ||
(detect_container(NULL) == 0 && !unit_cgroup_delegate(u)))
wait_for_exit = true;
if (c->send_sighup && k != KILL_KILL) {

View File

@ -184,9 +184,10 @@ struct Unit {
/* Counterparts in the cgroup filesystem */
char *cgroup_path;
CGroupControllerMask cgroup_realized_mask;
CGroupControllerMask cgroup_subtree_mask;
CGroupControllerMask cgroup_members_mask;
CGroupMask cgroup_realized_mask;
CGroupMask cgroup_subtree_mask;
CGroupMask cgroup_members_mask;
int cgroup_inotify_wd;
/* How to start OnFailure units */
JobMode on_failure_job_mode;
@ -522,7 +523,6 @@ void unit_notify(Unit *u, UnitActiveState os, UnitActiveState ns, bool reload_su
int unit_watch_pid(Unit *u, pid_t pid);
void unit_unwatch_pid(Unit *u, pid_t pid);
int unit_watch_all_pids(Unit *u);
void unit_unwatch_all_pids(Unit *u);
void unit_tidy_watch_pids(Unit *u, pid_t except1, pid_t except2);
@ -567,8 +567,6 @@ bool unit_active_or_pending(Unit *u);
int unit_add_default_target_dependency(Unit *u, Unit *target);
char *unit_default_cgroup_path(Unit *u);
void unit_start_on_failure(Unit *u);
void unit_trigger_notify(Unit *u);

View File

@ -204,6 +204,7 @@ static char **arg_property = NULL;
static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
static bool arg_userns = false;
static int arg_kill_signal = 0;
static bool arg_unified_cgroup_hierarchy = false;
static void help(void) {
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
@ -385,6 +386,30 @@ static int set_sanitized_path(char **b, const char *path) {
return 0;
}
static int detect_unified_cgroup_hierarchy(void) {
const char *e;
int r;
/* Allow the user to control whether the unified hierarchy is used */
e = getenv("UNIFIED_CGROUP_HIERARCHY");
if (e) {
r = parse_boolean(e);
if (r < 0)
return log_error_errno(r, "Failed to parse $UNIFIED_CGROUP_HIERARCHY.");
arg_unified_cgroup_hierarchy = r;
return 0;
}
/* Otherwise inherit the default from the host system */
r = cg_unified();
if (r < 0)
return log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
arg_unified_cgroup_hierarchy = r;
return 0;
}
static int parse_argv(int argc, char *argv[]) {
enum {
@ -1037,6 +1062,10 @@ static int parse_argv(int argc, char *argv[]) {
if (arg_boot && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
r = detect_unified_cgroup_hierarchy();
if (r < 0)
return r;
return 1;
}
@ -1095,7 +1124,6 @@ static int mount_all(const char *dest, bool userns) {
{ "/proc/sys", "/proc/sys", NULL, NULL, MS_BIND, true, true }, /* Bind mount first */
{ NULL, "/proc/sys", NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, true, true }, /* Then, make it r/o */
{ "sysfs", "/sys", "sysfs", NULL, MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true, false },
{ "tmpfs", "/sys/fs/cgroup", "tmpfs", "mode=755", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, true, false },
{ "tmpfs", "/dev", "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME, true, false },
{ "tmpfs", "/dev/shm", "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
{ "tmpfs", "/run", "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true, false },
@ -1381,7 +1409,7 @@ static int mount_custom(const char *dest) {
return 0;
}
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
static int mount_legacy_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
char *to;
int r;
@ -1409,11 +1437,31 @@ static int mount_cgroup_hierarchy(const char *dest, const char *controller, cons
return 1;
}
static int mount_cgroup(const char *dest) {
static int mount_legacy_cgroups(const char *dest) {
_cleanup_set_free_free_ Set *controllers = NULL;
const char *cgroup_root;
int r;
cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
/* Mount a tmpfs to /sys/fs/cgroup if it's not mounted there yet. */
r = path_is_mount_point(cgroup_root, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_error_errno(r, "Failed to determine if /sys/fs/cgroup is already mounted: %m");
if (r == 0) {
_cleanup_free_ char *options = NULL;
r = tmpfs_patch_options("mode=755", &options);
if (r < 0)
return log_oom();
if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options) < 0)
return log_error_errno(errno, "Failed to mount /sys/fs/cgroup: %m");
}
if (cg_unified() > 0)
goto skip_controllers;
controllers = set_new(&string_hash_ops);
if (!controllers)
return log_oom();
@ -1437,7 +1485,7 @@ static int mount_cgroup(const char *dest) {
if (r == -EINVAL) {
/* Not a symbolic link, but directly a single cgroup hierarchy */
r = mount_cgroup_hierarchy(dest, controller, controller, true);
r = mount_legacy_cgroup_hierarchy(dest, controller, controller, true);
if (r < 0)
return r;
@ -1457,7 +1505,7 @@ static int mount_cgroup(const char *dest) {
continue;
}
r = mount_cgroup_hierarchy(dest, combined, combined, true);
r = mount_legacy_cgroup_hierarchy(dest, combined, combined, true);
if (r < 0)
return r;
@ -1471,17 +1519,52 @@ static int mount_cgroup(const char *dest) {
}
}
r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
skip_controllers:
r = mount_legacy_cgroup_hierarchy(dest, "none,name=systemd,xattr", "systemd", false);
if (r < 0)
return r;
cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
return 0;
}
static int mount_unified_cgroups(const char *dest) {
const char *p;
int r;
assert(dest);
p = strjoina(dest, "/sys/fs/cgroup");
r = path_is_mount_point(p, AT_SYMLINK_FOLLOW);
if (r < 0)
return log_error_errno(r, "Failed to determine if %s is mounted already: %m", p);
if (r > 0) {
p = strjoina(dest, "/sys/fs/cgroup/cgroup.procs");
if (access(p, F_OK) >= 0)
return 0;
if (errno != ENOENT)
return log_error_errno(errno, "Failed to determine if mount point %s contains the unified cgroup hierarchy: %m", p);
log_error("%s is already mounted but not a unified cgroup hierarchy. Refusing.", p);
return -EINVAL;
}
if (mount("cgroup", p, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior") < 0)
return log_error_errno(errno, "Failed to mount unified cgroup hierarchy to %s: %m", p);
return 0;
}
static int mount_cgroups(const char *dest) {
if (arg_unified_cgroup_hierarchy)
return mount_unified_cgroups(dest);
else
return mount_legacy_cgroups(dest);
}
static int mount_systemd_cgroup_writable(const char *dest) {
_cleanup_free_ char *own_cgroup_path = NULL;
const char *systemd_root, *systemd_own;
@ -1493,13 +1576,23 @@ static int mount_systemd_cgroup_writable(const char *dest) {
if (r < 0)
return log_error_errno(r, "Failed to determine our own cgroup path: %m");
/* If we are living in the top-level, then there's nothing to do... */
if (path_equal(own_cgroup_path, "/"))
return 0;
if (arg_unified_cgroup_hierarchy) {
systemd_own = strjoina(dest, "/sys/fs/cgroup", own_cgroup_path);
systemd_root = prefix_roota(dest, "/sys/fs/cgroup");
} else {
systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
}
/* Make our own cgroup a (writable) bind mount */
systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
if (mount(systemd_own, systemd_own, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
/* And then remount the systemd cgroup root read-only */
systemd_root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
@ -4187,6 +4280,8 @@ static int inner_child(
assert(directory);
assert(kmsg_socket >= 0);
cg_unified_flush();
if (arg_userns) {
/* Tell the parent, that it now can write the UID map. */
(void) barrier_place(barrier); /* #1 */
@ -4368,6 +4463,8 @@ static int outer_child(
assert(pid_socket >= 0);
assert(kmsg_socket >= 0);
cg_unified_flush();
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
@ -4484,7 +4581,7 @@ static int outer_child(
if (r < 0)
return r;
r = mount_cgroup(directory);
r = mount_cgroups(directory);
if (r < 0)
return r;
@ -4499,7 +4596,6 @@ static int outer_child(
NULL);
if (pid < 0)
return log_error_errno(errno, "Failed to fork inner child: %m");
if (pid == 0) {
pid_socket = safe_close(pid_socket);
uid_shift_socket = safe_close(uid_shift_socket);
@ -4567,9 +4663,112 @@ static int chown_cgroup(pid_t pid) {
if (fd < 0)
return log_error_errno(errno, "Failed to open %s: %m", fs);
FOREACH_STRING(fn, ".", "tasks", "notify_on_release", "cgroup.procs", "cgroup.clone_children")
FOREACH_STRING(fn,
".",
"tasks",
"notify_on_release",
"cgroup.procs",
"cgroup.clone_children",
"cgroup.controllers",
"cgroup.subtree_control",
"cgroup.populated")
if (fchownat(fd, fn, arg_uid_shift, arg_uid_shift, 0) < 0)
log_warning_errno(errno, "Failed to chown() cgroup file %s, ignoring: %m", fn);
log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
"Failed to chown() cgroup file %s, ignoring: %m", fn);
return 0;
}
static int sync_cgroup(pid_t pid) {
_cleanup_free_ char *cgroup = NULL;
char tree[] = "/tmp/unifiedXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
bool undo_mount = false;
const char *fn;
int unified, r;
unified = cg_unified();
if (unified < 0)
return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
if ((unified > 0) == arg_unified_cgroup_hierarchy)
return 0;
/* When the host uses the legacy cgroup setup, but the
* container shall use the unified hierarchy, let's make sure
* we copy the path from the name=systemd hierarchy into the
* unified hierarchy. Similar for the reverse situation. */
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
/* In order to access the unified hierarchy we need to mount it */
if (!mkdtemp(tree))
return log_error_errno(errno, "Failed to generate temporary mount point for unified hierarchy: %m");
if (unified)
r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
else
r = mount("cgroup", tree, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, "__DEVEL__sane_behavior");
if (r < 0) {
r = log_error_errno(errno, "Failed to mount unified hierarchy: %m");
goto finish;
}
undo_mount = true;
fn = strjoina(tree, cgroup, "/cgroup.procs");
(void) mkdir_parents(fn, 0755);
sprintf(pid_string, PID_FMT, pid);
r = write_string_file(fn, pid_string, 0);
if (r < 0)
log_error_errno(r, "Failed to move process: %m");
finish:
if (undo_mount)
(void) umount(tree);
(void) rmdir(tree);
return r;
}
static int create_subcgroup(pid_t pid) {
_cleanup_free_ char *cgroup = NULL;
const char *child;
int unified, r;
/* In the unified hierarchy inner nodes may only only contain
* subgroups, but not processes. Hence, if we running in the
* unified hierarchy and the container does the same, and we
* did not create a scope unit for the container move us and
* the container into two separate subcgroups. */
if (!arg_keep_unit)
return 0;
if (!arg_unified_cgroup_hierarchy)
return 0;
unified = cg_unified();
if (unified < 0)
return log_error_errno(unified, "Failed to determine whether the unified hierachy is used: %m");
if (unified == 0)
return 0;
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
if (r < 0)
return log_error_errno(r, "Failed to get our control group: %m");
child = strjoina(cgroup, "/payload");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, pid);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
child = strjoina(cgroup, "/supervisor");
r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, child, 0);
if (r < 0)
return log_error_errno(r, "Failed to create %s subcgroup: %m", child);
return 0;
}
@ -4976,6 +5175,14 @@ int main(int argc, char *argv[]) {
if (r < 0)
goto finish;
r = sync_cgroup(pid);
if (r < 0)
goto finish;
r = create_subcgroup(pid);
if (r < 0)
goto finish;
r = chown_cgroup(pid);
if (r < 0)
goto finish;

View File

@ -61,36 +61,36 @@ static int test_cgroup_mask(void) {
root = UNIT_DEREF(parent->slice);
/* Verify per-unit cgroups settings. */
assert_se(unit_get_cgroup_mask(son) == (CGROUP_CPU | CGROUP_CPUACCT));
assert_se(unit_get_cgroup_mask(daughter) == 0);
assert_se(unit_get_cgroup_mask(grandchild) == 0);
assert_se(unit_get_cgroup_mask(parent_deep) == CGROUP_MEMORY);
assert_se(unit_get_cgroup_mask(parent) == CGROUP_BLKIO);
assert_se(unit_get_cgroup_mask(root) == 0);
assert_se(unit_get_own_mask(son) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT));
assert_se(unit_get_own_mask(daughter) == 0);
assert_se(unit_get_own_mask(grandchild) == 0);
assert_se(unit_get_own_mask(parent_deep) == CGROUP_MASK_MEMORY);
assert_se(unit_get_own_mask(parent) == CGROUP_MASK_BLKIO);
assert_se(unit_get_own_mask(root) == 0);
/* Verify aggregation of member masks */
assert_se(unit_get_members_mask(son) == 0);
assert_se(unit_get_members_mask(daughter) == 0);
assert_se(unit_get_members_mask(grandchild) == 0);
assert_se(unit_get_members_mask(parent_deep) == 0);
assert_se(unit_get_members_mask(parent) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
assert_se(unit_get_members_mask(root) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
assert_se(unit_get_members_mask(parent) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
assert_se(unit_get_members_mask(root) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
/* Verify aggregation of sibling masks. */
assert_se(unit_get_siblings_mask(son) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
assert_se(unit_get_siblings_mask(daughter) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
assert_se(unit_get_siblings_mask(son) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
assert_se(unit_get_siblings_mask(daughter) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
assert_se(unit_get_siblings_mask(grandchild) == 0);
assert_se(unit_get_siblings_mask(parent_deep) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY));
assert_se(unit_get_siblings_mask(parent) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
assert_se(unit_get_siblings_mask(root) == (CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY));
assert_se(unit_get_siblings_mask(parent_deep) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY));
assert_se(unit_get_siblings_mask(parent) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
assert_se(unit_get_siblings_mask(root) == (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
/* Verify aggregation of target masks. */
assert_se(unit_get_target_mask(son) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(daughter) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(son) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(daughter) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(grandchild) == 0);
assert_se(unit_get_target_mask(parent_deep) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(parent) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(root) == ((CGROUP_CPU | CGROUP_CPUACCT | CGROUP_BLKIO | CGROUP_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(parent_deep) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(parent) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
assert_se(unit_get_target_mask(root) == ((CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
manager_free(m);

View File

@ -74,8 +74,8 @@ int main(int argc, char*argv[]) {
cg_trim(SYSTEMD_CGROUP_CONTROLLER, "/", false);
assert_se(cg_delete(SYSTEMD_CGROUP_CONTROLLER, "/test-b") < 0);
assert_se(cg_delete(SYSTEMD_CGROUP_CONTROLLER, "/test-a") >= 0);
assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, "/test-b") < 0);
assert_se(cg_rmdir(SYSTEMD_CGROUP_CONTROLLER, "/test-a") >= 0);
assert_se(cg_split_spec("foobar:/", &c, &p) == 0);
assert_se(streq(c, "foobar"));