mirror of https://github.com/systemd/systemd.git
cgroup: Add DisableControllers= directive to disable controller in subtree
Some controllers (like the CPU controller) have a performance cost that is non-trivial on certain workloads. While this can be mitigated and improved to an extent, there will for some controllers always be some overhead associated with the benefits gained from the controller.

Inside Facebook, the fix applied has been to disable the CPU controller forcibly with `cgroup_disable=cpu` on the kernel command line. This presents a problem: to disable or re-enable the controller, a reboot is required, which is cumbersome and slow to do for many thousands of machines, especially machines where disabling or enabling a stateful service is a matter of several minutes.

Currently systemd provides some configuration knobs for these in the form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the like. The limitation of these is that Default*Accounting is overridable by individual services, any one of which could decide to re-enable a controller within the hierarchy at any point just by using a controller feature implicitly (e.g. `CPUWeight`), even if the use of that CPU feature could just be opportunistic. Since many services are provided by the distribution, or by upstream teams at a particular organisation, it's not a sustainable solution to simply try to find and remove offending directives from these units.

This commit presents a more direct solution: a `DisableControllers=` directive that forcibly disallows a controller from being enabled within a subtree.
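As a sketch of how an administrator might use the new directive (the slice name and drop-in path below are illustrative, not part of this commit), the CPU controller could be switched off for a whole subtree without editing any of the units inside it:

```ini
# Hypothetical drop-in: /etc/systemd/system/workload.slice.d/50-disable-cpu.conf
[Slice]
# No unit below workload.slice may enable the cpu controller, even if it sets
# CPUWeight= or a similar directive; such settings simply do not take effect here.
DisableControllers=cpu
```

Unlike `cgroup_disable=cpu` on the kernel command line, this is ordinary unit configuration, so it can be changed without rebooting the machine.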
This commit is contained in:
parent 4f6f62e468
commit c72703e26d
@@ -736,6 +736,27 @@
         </listitem>
       </varlistentry>
+
+      <varlistentry>
+        <term><varname>DisableControllers=</varname></term>
+
+        <listitem>
+          <para>Disables controllers from being enabled for a unit's children. If a controller listed is already in use
+          in its subtree, the controller will be removed from the subtree. This can be used to avoid child units being
+          able to implicitly or explicitly enable a controller. Defaults to not disabling any controllers.</para>
+
+          <para>It may not be possible to successfully disable a controller if the unit or any child of the unit in
+          question delegates controllers to its children, as any delegated subtree of the cgroup hierarchy is unmanaged
+          by systemd.</para>
+
+          <para>Multiple controllers may be specified, separated by spaces. You may also pass
+          <varname>DisableControllers=</varname> multiple times, in which case each new instance adds another controller
+          to disable. Passing <varname>DisableControllers=</varname> by itself with no controller name present resets
+          the disabled controller list.</para>
+
+          <para>Valid controllers are <option>cpu</option>, <option>cpuacct</option>, <option>io</option>,
+          <option>blkio</option>, <option>memory</option>, <option>devices</option>, and <option>pids</option>.</para>
+        </listitem>
+      </varlistentry>
     </variablelist>
   </refsect1>
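The documentation above describes merge-and-reset semantics for repeated assignments. A small, hypothetical unit fragment, purely to illustrate that behaviour:

```ini
[Slice]
# Each assignment adds to the set of disabled controllers ...
DisableControllers=cpu cpuacct
DisableControllers=memory
# ... so cpu, cpuacct and memory are now all disabled for this subtree.
# An empty assignment would clear the accumulated list again:
#DisableControllers=
```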
@@ -1346,7 +1346,7 @@ CGroupMask unit_get_own_mask(Unit *u) {
         if (!c)
                 return 0;
 
-        return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u);
+        return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u);
 }
 
 CGroupMask unit_get_delegate_mask(Unit *u) {

@@ -1461,6 +1461,7 @@ CGroupMask unit_get_target_mask(Unit *u) {
 
         mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u);
         mask &= u->manager->cgroup_supported;
+        mask &= ~unit_get_ancestor_disable_mask(u);
 
         return mask;
 }

@@ -1475,6 +1476,7 @@ CGroupMask unit_get_enable_mask(Unit *u) {
 
         mask = unit_get_members_mask(u);
         mask &= u->manager->cgroup_supported;
+        mask &= ~unit_get_ancestor_disable_mask(u);
 
         return mask;
 }

@@ -1914,8 +1916,8 @@ static bool unit_has_mask_enables_realized(
          * we want to add is already added. */
 
         return u->cgroup_realized &&
-               ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == u->cgroup_realized_mask &&
-               ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == u->cgroup_enabled_mask;
+               ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) &&
+               ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2);
 }
 
 void unit_add_to_cgroup_realize_queue(Unit *u) {

@@ -1965,11 +1967,7 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) {
         new_target_mask = u->cgroup_realized_mask | target_mask;
         new_enable_mask = u->cgroup_enabled_mask | enable_mask;
 
-        r = unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
-        if (r < 0)
-                return r;
-
-        return 0;
+        return unit_create_cgroup(u, new_target_mask, new_enable_mask, state);
 }
 
 /* Controllers can only be disabled depth-first, from the leaves of the

@@ -2043,7 +2041,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) {
          * h i j k l m n o
          *
          * 1. We want to realise cgroup "d" now.
-         * 2. cgroup "a" has DisableController=cpu in the associated unit.
+         * 2. cgroup "a" has DisableControllers=cpu in the associated unit.
          * 3. cgroup "k" just started requesting the memory controller.
          *
         * To make this work we must do the following in order:
@@ -118,6 +118,8 @@ struct CGroupContext {
 
         bool delegate;
         CGroupMask delegate_controllers;
+
+        CGroupMask disable_controllers;
 };
 
 /* Used when querying IP accounting data */

@@ -151,6 +153,9 @@ CGroupMask unit_get_delegate_mask(Unit *u);
 CGroupMask unit_get_members_mask(Unit *u);
 CGroupMask unit_get_siblings_mask(Unit *u);
 CGroupMask unit_get_subtree_mask(Unit *u);
+CGroupMask unit_get_disable_mask(Unit *u);
+CGroupMask unit_get_ancestor_disable_mask(Unit *u);
 
 CGroupMask unit_get_target_mask(Unit *u);
 CGroupMask unit_get_enable_mask(Unit *u);
@@ -17,6 +17,38 @@
 
 static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
 
+static int property_get_cgroup_mask(
+                sd_bus *bus,
+                const char *path,
+                const char *interface,
+                const char *property,
+                sd_bus_message *reply,
+                void *userdata,
+                sd_bus_error *error) {
+
+        CGroupMask *mask = userdata;
+        CGroupController ctrl;
+        int r;
+
+        assert(bus);
+        assert(reply);
+
+        r = sd_bus_message_open_container(reply, 'a', "s");
+        if (r < 0)
+                return r;
+
+        for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) {
+                if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0)
+                        continue;
+
+                r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl));
+                if (r < 0)
+                        return r;
+        }
+
+        return sd_bus_message_close_container(reply);
+}
+
 static int property_get_delegate_controllers(
                 sd_bus *bus,
                 const char *path,

@@ -27,8 +59,6 @@ static int property_get_delegate_controllers(
                 sd_bus_error *error) {
 
         CGroupContext *c = userdata;
-        CGroupController cc;
-        int r;
 
         assert(bus);
         assert(reply);

@@ -37,20 +67,7 @@ static int property_get_delegate_controllers(
         if (!c->delegate)
                 return sd_bus_message_append(reply, "as", 0);
 
-        r = sd_bus_message_open_container(reply, 'a', "s");
-        if (r < 0)
-                return r;
-
-        for (cc = 0; cc < _CGROUP_CONTROLLER_MAX; cc++) {
-                if ((c->delegate_controllers & CGROUP_CONTROLLER_TO_MASK(cc)) == 0)
-                        continue;
-
-                r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(cc));
-                if (r < 0)
-                        return r;
-        }
-
-        return sd_bus_message_close_container(reply);
+        return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error);
 }
 
 static int property_get_io_device_weight(

@@ -342,6 +359,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
         SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0),
         SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0),
         SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0),
+        SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0),
         SD_BUS_VTABLE_END
 };
@@ -192,6 +192,7 @@ $1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0,
 $1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting)
 $1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max)
 $1.Delegate, config_parse_delegate, 0, offsetof($1, cgroup_context)
+$1.DisableControllers, config_parse_disable_controllers, 0, offsetof($1, cgroup_context)
 $1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting)
 $1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow)
 $1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny)
@@ -4323,6 +4323,41 @@ int config_parse_exit_status(
         return 0;
 }
 
+int config_parse_disable_controllers(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int r;
+        CGroupContext *c = data;
+        CGroupMask disabled_mask;
+
+        /* 1. If empty, make all controllers eligible for use again.
+         * 2. If non-empty, merge all listed controllers, space separated. */
+
+        if (isempty(rvalue)) {
+                c->disable_controllers = 0;
+                return 0;
+        }
+
+        r = cg_mask_from_string(rvalue, &disabled_mask);
+        if (r < 0 || disabled_mask <= 0) {
+                log_syntax(unit, LOG_ERR, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue);
+                return 0;
+        }
+
+        c->disable_controllers |= disabled_mask;
+
+        return 0;
+}
+
 #define FOLLOW_MAX 8
 
 static int open_follow(char **filename, FILE **_f, Set *names, char **_final) {
@@ -105,6 +105,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields);
 CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
 CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
 CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
+CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
 
 /* gperf prototypes */
 const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
@@ -30,7 +30,7 @@ static void log_cgroup_mask(CGroupMask got, CGroupMask expected) {
 static int test_cgroup_mask(void) {
         _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
         _cleanup_(manager_freep) Manager *m = NULL;
-        Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep;
+        Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep, *nomem_parent, *nomem_leaf;
         int r;
         CGroupMask cpu_accounting_mask = get_cpu_accounting_mask();
 

@@ -68,11 +68,15 @@ static int test_cgroup_mask(void) {
         assert_se(manager_load_startable_unit_or_warn(m, "daughter.service", NULL, &daughter) >= 0);
         assert_se(manager_load_startable_unit_or_warn(m, "grandchild.service", NULL, &grandchild) >= 0);
         assert_se(manager_load_startable_unit_or_warn(m, "parent-deep.slice", NULL, &parent_deep) >= 0);
+        assert_se(manager_load_startable_unit_or_warn(m, "nomem.slice", NULL, &nomem_parent) >= 0);
+        assert_se(manager_load_startable_unit_or_warn(m, "nomemleaf.service", NULL, &nomem_leaf) >= 0);
         assert_se(UNIT_DEREF(son->slice) == parent);
         assert_se(UNIT_DEREF(daughter->slice) == parent);
         assert_se(UNIT_DEREF(parent_deep->slice) == parent);
         assert_se(UNIT_DEREF(grandchild->slice) == parent_deep);
+        assert_se(UNIT_DEREF(nomem_leaf->slice) == nomem_parent);
         root = UNIT_DEREF(parent->slice);
+        assert_se(UNIT_DEREF(nomem_parent->slice) == root);
 
         /* Verify per-unit cgroups settings. */
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(son), CGROUP_MASK_CPU);

@@ -80,6 +84,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent_deep), CGROUP_MASK_MEMORY);
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_parent), 0);
+        ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
         ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(root), 0);
 
         /* Verify aggregation of member masks */

@@ -88,6 +94,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent_deep), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_leaf), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of sibling masks. */

@@ -96,6 +104,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(grandchild), 0);
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent_deep), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY));
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_parent), (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
+        ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO));
         ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY));
 
         /* Verify aggregation of target masks. */

@@ -104,6 +114,8 @@ static int test_cgroup_mask(void) {
         ASSERT_CGROUP_MASK(unit_get_target_mask(grandchild), 0);
         ASSERT_CGROUP_MASK(unit_get_target_mask(parent_deep), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported));
         ASSERT_CGROUP_MASK(unit_get_target_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
+        ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
+        ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_leaf), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported));
         ASSERT_CGROUP_MASK(unit_get_target_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported));
 
         return 0;
@@ -18,6 +18,8 @@ test_data_files = '''
         hwdb/10-bad.hwdb
         journal-data/journal-1.txt
         journal-data/journal-2.txt
+        nomem.slice
+        nomemleaf.service
         parent-deep.slice
         parent.slice
         sched_idle_bad.service
test/nomem.slice (new file)
@@ -0,0 +1,5 @@
+[Unit]
+Description=Nomem Parent Slice
+
+[Slice]
+DisableControllers=memory
test/nomemleaf.service (new file)
@@ -0,0 +1,9 @@
+[Unit]
+Description=Nomem Leaf Service
+
+[Service]
+Slice=nomem.slice
+Type=oneshot
+ExecStart=/bin/true
+IOWeight=200
+MemoryAccounting=true