1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-10 05:18:17 +03:00

Merge pull request #18401 from anitazha/oomdxattr

oomd: implement avoid/omit support for cgroups
This commit is contained in:
Zbigniew Jędrzejewski-Szmek 2021-02-13 10:00:31 +01:00 committed by GitHub
commit b3c57df0f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 380 additions and 41 deletions

View File

@ -273,6 +273,7 @@ All cgroup/resource control settings are available for transient units
✓ ManagedOOMSwap=
✓ ManagedOOMMemoryPressure=
✓ ManagedOOMMemoryPressureLimit=
✓ ManagedOOMPreference=
```
## Process Killing Settings

View File

@ -2450,6 +2450,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -2974,6 +2976,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -3538,6 +3542,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -4204,6 +4210,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -4756,6 +4764,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -5318,6 +5328,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -5897,6 +5909,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -6377,6 +6391,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -6857,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -7557,6 +7575,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly as Environment = ['...', ...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -8023,6 +8043,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--property EnvironmentFiles is not documented!-->
<!--property PassEnvironment is not documented!-->
@ -8489,6 +8511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="Environment"/>
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/>
@ -9042,6 +9066,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
};
interface org.freedesktop.DBus.Peer { ... };
interface org.freedesktop.DBus.Introspectable { ... };
@ -9178,6 +9204,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--Autogenerated cross-references for systemd.directives, do not edit-->
<variablelist class="dbus-interface" generated="True" extra-ref="org.freedesktop.systemd1.Unit"/>
@ -9318,6 +9346,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<!--End of Autogenerated section-->
<refsect2>
@ -9477,6 +9507,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
readonly s ManagedOOMMemoryPressure = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s ManagedOOMPreference = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s KillMode = '...';
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
@ -9629,6 +9661,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<!--property ManagedOOMMemoryPressureLimitPermyriad is not documented!-->
<!--property ManagedOOMPreference is not documented!-->
<!--property KillMode is not documented!-->
<!--property KillSignal is not documented!-->
@ -9795,6 +9829,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMMemoryPressureLimitPermyriad"/>
<variablelist class="dbus-property" generated="True" extra-ref="ManagedOOMPreference"/>
<variablelist class="dbus-property" generated="True" extra-ref="KillMode"/>
<variablelist class="dbus-property" generated="True" extra-ref="KillSignal"/>

View File

@ -913,6 +913,38 @@ DeviceAllow=/dev/loop-control
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><varname>ManagedOOMPreference=none|avoid|omit</varname></term>
<listitem>
<para>Allows deprioritizing or omitting this unit's cgroup as a candidate when <command>systemd-oomd</command>
needs to act. Requires support for extended attributes (see
<citerefentry project='man-pages'><refentrytitle>xattr</refentrytitle><manvolnum>7</manvolnum></citerefentry>)
in order to use <option>avoid</option> or <option>omit</option>. Additionally, <command>systemd-oomd</command>
will ignore these extended attributes if the unit's cgroup is not owned by the root user.</para>
<para>If this property is set to <option>avoid</option>, the service manager will set the
"user.oomd_avoid" extended attribute on the unit's cgroup to "1". If <command>systemd-oomd</command> sees
this extended attribute on a cgroup set to "1" when choosing between candidates, it will only select the
cgroup with "user.oomd_avoid" if there are no other viable candidates.</para>
<para>If this property is set to <option>omit</option>, the service manager will set the "user.oomd_omit"
extended attribute on the unit's cgroup to "1". If <command>systemd-oomd</command> sees the this extended
attribute on the cgroup set to "1", it will ignore the cgroup as a candidate and will not perform any actions
on the cgroup.</para>
<para>It is recommended to use <option>avoid</option> and <option>omit</option> sparingly as it can adversely
affect <command>systemd-oomd</command>'s kill behavior. Also note that these extended attributes are not
applied recursively to cgroups under this unit's cgroup.</para>
<para>Defaults to <option>none</option> which means no extended attributes will be set and systemd-oomd will
sort this unit's cgroup as defined in
<citerefentry><refentrytitle>systemd-oomd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry>
and <citerefentry><refentrytitle>oomd.conf</refentrytitle><manvolnum>5</manvolnum></citerefentry> (if this
unit's cgroup becomes a candidate).</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>

View File

@ -635,6 +635,20 @@ int cg_get_xattr_malloc(const char *controller, const char *path, const char *na
return r;
}
int cg_get_xattr_bool(const char *controller, const char *path, const char *name) {
_cleanup_free_ char *val = NULL;
int r;
assert(path);
assert(name);
r = cg_get_xattr_malloc(controller, path, name, &val);
if (r < 0)
return r;
return parse_boolean(val);
}
int cg_remove_xattr(const char *controller, const char *path, const char *name) {
_cleanup_free_ char *fs = NULL;
int r;
@ -1703,6 +1717,25 @@ int cg_get_attribute_as_bool(const char *controller, const char *path, const cha
return 0;
}
int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid) {
_cleanup_free_ char *f = NULL;
struct stat stats;
int r;
assert(ret_uid);
r = cg_get_path(controller, path, NULL, &f);
if (r < 0)
return r;
r = stat(f, &stats);
if (r < 0)
return -errno;
*ret_uid = stats.st_uid;
return 0;
}
int cg_get_keyed_attribute_full(
const char *controller,
const char *path,
@ -2185,3 +2218,11 @@ static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
};
DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);
static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
[MANAGED_OOM_PREFERENCE_NONE] = "none",
[MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
[MANAGED_OOM_PREFERENCE_OMIT] = "omit",
};
DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);

View File

@ -212,10 +212,13 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c
int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret);
int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid);
int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid);
int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags);
int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size);
int cg_get_xattr_malloc(const char *controller, const char *path, const char *name, char **ret);
/* Returns negative on error, and 0 or 1 on success for the bool value */
int cg_get_xattr_bool(const char *controller, const char *path, const char *name);
int cg_remove_xattr(const char *controller, const char *path, const char *name);
int cg_install_release_agent(const char *controller, const char *agent);
@ -288,3 +291,14 @@ typedef enum ManagedOOMMode {
const char* managed_oom_mode_to_string(ManagedOOMMode m) _const_;
ManagedOOMMode managed_oom_mode_from_string(const char *s) _pure_;
typedef enum ManagedOOMPreference {
MANAGED_OOM_PREFERENCE_NONE = 0,
MANAGED_OOM_PREFERENCE_AVOID = 1,
MANAGED_OOM_PREFERENCE_OMIT = 2,
_MANAGED_OOM_PREFERENCE_MAX,
_MANAGED_OOM_PREFERENCE_INVALID = -1
} ManagedOOMPreference;
const char* managed_oom_preference_to_string(ManagedOOMPreference a) _const_;
ManagedOOMPreference managed_oom_preference_from_string(const char *s) _pure_;

View File

@ -131,6 +131,7 @@ void cgroup_context_init(CGroupContext *c) {
.moom_swap = MANAGED_OOM_AUTO,
.moom_mem_pressure = MANAGED_OOM_AUTO,
.moom_preference = MANAGED_OOM_PREFERENCE_NONE,
};
}
@ -417,7 +418,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
"%sDelegate: %s\n"
"%sManagedOOMSwap: %s\n"
"%sManagedOOMMemoryPressure: %s\n"
"%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n",
"%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n"
"%sManagedOOMPreference: %s%%\n",
prefix, yes_no(c->cpu_accounting),
prefix, yes_no(c->io_accounting),
prefix, yes_no(c->blockio_accounting),
@ -450,7 +452,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) {
prefix, yes_no(c->delegate),
prefix, managed_oom_mode_to_string(c->moom_swap),
prefix, managed_oom_mode_to_string(c->moom_mem_pressure),
prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100);
prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100,
prefix, managed_oom_preference_to_string(c->moom_preference));
if (c->delegate) {
_cleanup_free_ char *t = NULL;
@ -600,6 +603,41 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode)
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low);
UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min);
void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) {
CGroupContext *c;
int r;
assert(u);
c = unit_get_cgroup_context(u);
if (!c)
return;
if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) {
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit", "1", 1, 0);
if (r < 0)
log_unit_debug_errno(u, r, "Failed to set oomd_omit flag on control group %s, ignoring: %m", cgroup_path);
}
if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) {
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid", "1", 1, 0);
if (r < 0)
log_unit_debug_errno(u, r, "Failed to set oomd_avoid flag on control group %s, ignoring: %m", cgroup_path);
}
if (c->moom_preference != MANAGED_OOM_PREFERENCE_AVOID) {
r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid");
if (r != -ENODATA)
log_unit_debug_errno(u, r, "Failed to remove oomd_avoid flag on control group %s, ignoring: %m", cgroup_path);
}
if (c->moom_preference != MANAGED_OOM_PREFERENCE_OMIT) {
r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit");
if (r != -ENODATA)
log_unit_debug_errno(u, r, "Failed to remove oomd_omit flag on control group %s, ignoring: %m", cgroup_path);
}
}
static void cgroup_xattr_apply(Unit *u) {
char ids[SD_ID128_STRING_MAX];
int r;
@ -630,6 +668,8 @@ static void cgroup_xattr_apply(Unit *u) {
if (r != -ENODATA)
log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path);
}
cgroup_oomd_xattr_apply(u, u->cgroup_path);
}
static int lookup_block_device(const char *p, dev_t *ret) {
@ -2746,7 +2786,7 @@ int unit_check_oomd_kill(Unit *u) {
else if (r == 0)
return 0;
r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value);
r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value);
if (r < 0 && r != -ENODATA)
return r;
@ -3737,12 +3777,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action) {
return 1;
}
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
[CGROUP_DEVICE_POLICY_AUTO] = "auto",
[CGROUP_DEVICE_POLICY_CLOSED] = "closed",
[CGROUP_DEVICE_POLICY_STRICT] = "strict",
};
int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
_cleanup_free_ char *v = NULL;
int r;
@ -3771,6 +3805,12 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) {
return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL);
}
static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = {
[CGROUP_DEVICE_POLICY_AUTO] = "auto",
[CGROUP_DEVICE_POLICY_CLOSED] = "closed",
[CGROUP_DEVICE_POLICY_STRICT] = "strict",
};
DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy);
static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = {

View File

@ -164,6 +164,7 @@ struct CGroupContext {
ManagedOOMMode moom_swap;
ManagedOOMMode moom_mem_pressure;
uint32_t moom_mem_pressure_limit_permyriad;
ManagedOOMPreference moom_preference;
};
/* Used when querying IP accounting data */
@ -204,6 +205,8 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI
int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode);
void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path);
CGroupMask unit_get_own_mask(Unit *u);
CGroupMask unit_get_delegate_mask(Unit *u);
CGroupMask unit_get_members_mask(Unit *u);

View File

@ -21,6 +21,7 @@ BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_res
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode);
static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, managed_oom_preference, ManagedOOMPreference);
static int property_get_cgroup_mask(
sd_bus *bus,
@ -395,6 +396,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = {
SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0),
SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0),
SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0),
SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0),
SD_BUS_VTABLE_END
};
@ -1720,6 +1722,26 @@ int bus_cgroup_set_property(
return 1;
}
if (streq(name, "ManagedOOMPreference")) {
ManagedOOMPreference p;
const char *pref;
r = sd_bus_message_read(message, "s", &pref);
if (r < 0)
return r;
p = managed_oom_preference_from_string(pref);
if (p < 0)
return -EINVAL;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
c->moom_preference = p;
unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref);
}
return 1;
}
if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB))
return bus_cgroup_set_transient_property(u, c, name, message, flags, error);

View File

@ -4706,6 +4706,10 @@ int exec_spawn(Unit *unit,
r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
if (r < 0)
return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
/* Normally we would not propagate the oomd xattrs to children but since we created this
* sub-cgroup internally we should do it. */
cgroup_oomd_xattr_apply(unit, subcgroup_path);
}
}

View File

@ -230,6 +230,7 @@ $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs,
$1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap)
$1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure)
$1.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad)
$1.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof($1, cgroup_context.moom_preference)
$1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0'
)m4_dnl
Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description)

View File

@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceR
DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode");
DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference=");
DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");

View File

@ -78,6 +78,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max);
CONFIG_PARSER_PROTOTYPE(config_parse_delegate);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit);
CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference);
CONFIG_PARSER_PROTOTYPE(config_parse_device_policy);
CONFIG_PARSER_PROTOTYPE(config_parse_device_allow);
CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency);

View File

@ -93,7 +93,7 @@ static int process_managed_oom_reply(
m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;
if (reply.mode == MANAGED_OOM_AUTO) {
(void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path));
(void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path)));
continue;
}
@ -109,7 +109,7 @@ static int process_managed_oom_reply(
}
}
ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path);
ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path));
if (ret == -ENOMEM) {
r = ret;
goto finish;
@ -117,7 +117,7 @@ static int process_managed_oom_reply(
/* Always update the limit in case it was changed. For non-memory pressure detection the value is
* ignored so always updating it here is not a problem. */
ctx = hashmap_get(monitor_hm, reply.path);
ctx = hashmap_get(monitor_hm, empty_to_root(reply.path));
if (ctx)
ctx->mem_pressure_limit = limit;
}

View File

@ -3,7 +3,6 @@
#include <sys/xattr.h>
#include <unistd.h>
#include "cgroup-util.h"
#include "fd-util.h"
#include "format-util.h"
#include "oomd-util.h"
@ -159,7 +158,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha
return -ENOMEM;
HASHMAP_FOREACH(item, h) {
if (item->path && prefix && !path_startswith(item->path, prefix))
/* Skip over cgroups that are not valid candidates or are explicitly marked for omission */
if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->preference == MANAGED_OOM_PREFERENCE_OMIT)
continue;
sorted[k++] = item;
@ -201,9 +201,9 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
if (r < 0)
return r;
r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed));
r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed));
if (r < 0)
log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m");
log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m");
return set_size(pids_killed) != 0;
}
@ -214,13 +214,15 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
assert(h);
r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted);
r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted);
if (r < 0)
return r;
for (int i = 0; i < r; i++) {
if (sorted[i]->pgscan == 0)
break;
/* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */
/* Don't break since there might be "avoid" cgroups at the end. */
if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0)
continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
@ -243,8 +245,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) {
/* Try to kill cgroups with non-zero swap usage until we either succeed in
* killing or we get to a cgroup with no swap usage. */
for (int i = 0; i < r; i++) {
/* Skip over cgroups with no resource usage. Don't break since there might be "avoid"
* cgroups at the end. */
if (sorted[i]->swap_usage == 0)
break;
continue;
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
if (r > 0 || r == -ENOMEM)
@ -258,6 +262,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *p = NULL, *val = NULL;
bool is_root;
uid_t uid;
int r;
assert(path);
@ -268,6 +273,7 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
return -ENOMEM;
is_root = empty_or_root(path);
ctx->preference = MANAGED_OOM_PREFERENCE_NONE;
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
if (r < 0)
@ -277,6 +283,23 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) {
if (r < 0)
return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid);
if (r < 0)
log_debug_errno(r, "Failed to get owner/group from %s: %m", path);
else if (uid == 0) {
/* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used
* as an optional feature of systemd-oomd (and the system might not even support them). */
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid");
if (r == -ENOMEM)
return r;
ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_AVOID : ctx->preference;
r = cg_get_xattr_bool(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit");
if (r == -ENOMEM)
return r;
ctx->preference = r == 1 ? MANAGED_OOM_PREFERENCE_OMIT : ctx->preference;
}
if (is_root) {
r = procfs_memory_get_used(&ctx->current_memory_usage);
if (r < 0)

View File

@ -3,6 +3,7 @@
#include <stdbool.h>
#include "cgroup-util.h"
#include "hashmap.h"
#include "psi-util.h"
@ -29,6 +30,8 @@ struct OomdCGroupContext {
uint64_t last_pgscan;
uint64_t pgscan;
ManagedOOMPreference preference;
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
loadavg_t mem_pressure_limit;
usec_t mem_pressure_duration_usec;
@ -61,17 +64,35 @@ bool oomd_memory_reclaim(Hashmap *h);
/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
* (after the smallest values). */
static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
int r;
assert(c1);
assert(c2);
return CMP((*c2)->pgscan, (*c1)->pgscan);
r = CMP((*c1)->preference, (*c2)->preference);
if (r != 0)
return r;
r = CMP((*c2)->pgscan, (*c1)->pgscan);
if (r != 0)
return r;
return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage);
}
static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
int r;
assert(c1);
assert(c2);
r = CMP((*c1)->preference, (*c2)->preference);
if (r != 0)
return r;
return CMP((*c2)->swap_usage, (*c1)->swap_usage);
}

View File

@ -79,7 +79,7 @@ static void test_oomd_cgroup_kill(void) {
sleep(2);
assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true);
assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0);
assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0);
assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0);
}
}
@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
_cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
_cleanup_free_ char *cgroup = NULL;
OomdCGroupContext *c1, *c2;
bool test_xattrs;
int r;
if (geteuid() != 0)
return (void) log_tests_skipped("not root");
@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
/* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities
* so skip the xattr portions of the test. */
r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0);
test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r);
if (test_xattrs) {
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0);
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
}
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
assert_se(streq(ctx->path, cgroup));
@ -110,12 +122,28 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(ctx->swap_usage == 0);
assert_se(ctx->last_pgscan == 0);
assert_se(ctx->pgscan == 0);
/* omit takes precedence over avoid when both are set to true */
if (test_xattrs)
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_OMIT);
else
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
ctx = oomd_cgroup_context_free(ctx);
/* also check when only avoid is set to true */
if (test_xattrs) {
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "0", 1, 0) >= 0);
assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0);
}
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
if (test_xattrs)
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_AVOID);
ctx = oomd_cgroup_context_free(ctx);
/* Test the root cgroup */
assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
assert_se(streq(ctx->path, "/"));
assert_se(ctx->current_memory_usage > 0);
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
/* Test hashmap inserts */
assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
@ -137,6 +165,14 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
assert_se(c2->last_pgscan == 5555);
assert_se(c2->mem_pressure_limit == 6789);
assert_se(c2->last_hit_mem_pressure_limit == 42);
/* Assert that avoid/omit are not set if the cgroup is not owned by root */
if (test_xattrs) {
ctx = oomd_cgroup_context_free(ctx);
assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 0) >= 0);
assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
assert_se(ctx->preference == MANAGED_OOM_PREFERENCE_NONE);
}
}
static void test_oomd_system_context_acquire(void) {
@ -287,21 +323,35 @@ static void test_oomd_sort_cgroups(void) {
char **paths = STRV_MAKE("/herp.slice",
"/herp.slice/derp.scope",
"/herp.slice/derp.scope/sheep.service",
"/zupa.slice");
"/zupa.slice",
"/omitted.slice",
"/avoid.slice");
OomdCGroupContext ctx[4] = {
OomdCGroupContext ctx[6] = {
{ .path = paths[0],
.swap_usage = 20,
.pgscan = 60 },
.pgscan = 60,
.current_memory_usage = 10 },
{ .path = paths[1],
.swap_usage = 60,
.pgscan = 40 },
.pgscan = 40,
.current_memory_usage = 20 },
{ .path = paths[2],
.swap_usage = 40,
.pgscan = 20 },
.pgscan = 40,
.current_memory_usage = 40 },
{ .path = paths[3],
.swap_usage = 10,
.pgscan = 80 },
.pgscan = 80,
.current_memory_usage = 10 },
{ .path = paths[4],
.swap_usage = 90,
.pgscan = 100,
.preference = MANAGED_OOM_PREFERENCE_OMIT },
{ .path = paths[5],
.swap_usage = 99,
.pgscan = 200,
.preference = MANAGED_OOM_PREFERENCE_AVOID },
};
assert_se(h = hashmap_new(&string_hash_ops));
@ -310,26 +360,32 @@ static void test_oomd_sort_cgroups(void) {
assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[1]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[0]);
assert_se(sorted_cgroups[3] == &ctx[3]);
assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[3]);
assert_se(sorted_cgroups[1] == &ctx[0]);
assert_se(sorted_cgroups[2] == &ctx[1]);
assert_se(sorted_cgroups[3] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[2]);
assert_se(sorted_cgroups[3] == &ctx[1]);
assert_se(sorted_cgroups[4] == &ctx[5]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
assert_se(sorted_cgroups[0] == &ctx[1]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
assert_se(sorted_cgroups[0] == &ctx[2]);
assert_se(sorted_cgroups[1] == &ctx[1]);
assert_se(sorted_cgroups[2] == 0);
assert_se(sorted_cgroups[3] == 0);
assert_se(sorted_cgroups[4] == 0);
assert_se(sorted_cgroups[5] == 0);
sorted_cgroups = mfree(sorted_cgroups);
}

View File

@ -435,7 +435,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons
if (STR_IN_SET(field, "DevicePolicy",
"Slice",
"ManagedOOMSwap",
"ManagedOOMMemoryPressure"))
"ManagedOOMMemoryPressure",
"ManagedOOMPreference"))
return bus_append_string(m, field, eq);
if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) {

View File

@ -73,6 +73,7 @@ int main(int argc, char **argv) {
test_table(log_target, LOG_TARGET);
test_table(mac_address_policy, MAC_ADDRESS_POLICY);
test_table(managed_oom_mode, MANAGED_OOM_MODE);
test_table(managed_oom_preference, MANAGED_OOM_PREFERENCE);
test_table(manager_state, MANAGER_STATE);
test_table(manager_timestamp, MANAGER_TIMESTAMP);
test_table(mount_exec_command, MOUNT_EXEC_COMMAND);

View File

@ -138,6 +138,10 @@ MakeDirectory=
Mark=
MaxConnections=
MaxConnectionsPerSource=
ManagedOOMSwap=
ManagedOOMMemoryPressure=
ManagedOOMMemoryPressureLimitPercent=
ManagedOOMPreference=
MemoryAccounting=
MemoryHigh=
MemoryLimit=

View File

@ -124,6 +124,7 @@ BASICTOOLS=(
rmdir
sed
seq
setfattr
setfont
setsid
sfdisk

View File

@ -0,0 +1,7 @@
[Unit]
Description=Create some memory pressure
[Service]
MemoryHigh=2M
Slice=testsuite-56-workload.slice
ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh

View File

@ -13,6 +13,8 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]
fi
[[ -e /skipped ]] && exit 0 || true
rm -rf /etc/systemd/system/testsuite-56-testbloat.service.d
echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf
systemctl start testsuite-56-testchill.service
@ -23,20 +25,47 @@ oomctl | grep "/testsuite-56-workload.slice"
oomctl | grep "1.00%"
oomctl | grep "Default Memory Pressure Duration: 5s"
# systemd-oomd watches for elevated pressure for 30 seconds before acting.
# It can take time to build up pressure so either wait 5 minutes or for the service to fail.
timeout=$(date -ud "5 minutes" +%s)
# systemd-oomd watches for elevated pressure for 5 seconds before acting.
# It can take time to build up pressure so either wait 2 minutes or for the service to fail.
timeout=$(date -ud "2 minutes" +%s)
while [[ $(date -u +%s) -le $timeout ]]; do
if ! systemctl status testsuite-56-testbloat.service; then
break
fi
sleep 15
sleep 5
done
# testbloat should be killed and testchill should be fine
if systemctl status testsuite-56-testbloat.service; then exit 42; fi
if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
# only run this portion of the test if we can set xattrs
if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then
sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down
mkdir -p /etc/systemd/system/testsuite-56-testbloat.service.d/
echo "[Service]" > /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf
echo "ManagedOOMPreference=avoid" >> /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf
systemctl daemon-reload
systemctl start testsuite-56-testchill.service
systemctl start testsuite-56-testmunch.service
systemctl start testsuite-56-testbloat.service
timeout=$(date -ud "2 minutes" +%s)
while [[ $(date -u +%s) -le $timeout ]]; do
if ! systemctl status testsuite-56-testmunch.service; then
break
fi
sleep 5
done
# testmunch should be killed since testbloat had the avoid xattr on it
if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi
if systemctl status testsuite-56-testmunch.service; then exit 43; fi
if ! systemctl status testsuite-56-testchill.service; then exit 24; fi
fi
systemd-analyze log-level info
echo OK > /testok