mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-03-08 20:58:20 +03:00
core: implement OOMPolicy= and watch cgroups for OOM killings
This adds a new per-service OOMPolicy= (along with a global DefaultOOMPolicy=) that controls what to do if a process of the service is killed by the kernel's OOM killer. It has three different values: "continue" (old behaviour), "stop" (terminate the service), "kill" (let the kernel kill all the service's processes). On top of that, track OOM killer events per unit: generate a per-unit structured, recognizable log message when we see an OOM killer event, and put the service in a failure state if an OOM killer event was seen and the selected policy was not "continue". A new "result" is defined for this case: "oom-kill". All of this relies on new cgroupv2 kernel functionality: the "memory.events" notification interface and the "memory.oom.group" attribute (which makes the kernel kill all cgroup processes automatically).
This commit is contained in:
parent
a5b5aece01
commit
afcfaa695c
@ -3,6 +3,8 @@
|
||||
#include <fcntl.h>
|
||||
#include <fnmatch.h>
|
||||
|
||||
#include "sd-messages.h"
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "blockdev-util.h"
|
||||
#include "bpf-devices.h"
|
||||
@ -1141,6 +1143,8 @@ static void cgroup_context_apply(
|
||||
cgroup_apply_unified_memory_limit(u, "memory.max", max);
|
||||
cgroup_apply_unified_memory_limit(u, "memory.swap.max", swap_max);
|
||||
|
||||
(void) set_attribute_and_warn(u, "memory", "memory.oom.group", one_zero(c->memory_oom_group));
|
||||
|
||||
} else {
|
||||
char buf[DECIMAL_STR_MAX(uint64_t) + 1];
|
||||
uint64_t val;
|
||||
@ -1640,6 +1644,69 @@ int unit_watch_cgroup(Unit *u) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Installs an inotify watch (IN_MODIFY) on the "memory.events" attribute of the unit's cgroup and registers
 * the resulting watch descriptor in the manager's cgroup_memory_inotify_wd_unit map, so that "oom_kill"
 * events can be traced back to the unit.
 *
 * Returns 0 both when the watch was installed and when watching does not apply (no cgroup, no cgroup
 * context, memory accounting off, slice unit, already watched, not on the unified hierarchy, attribute
 * already gone). On failure the negative value produced by the respective logging helper is returned. */
int unit_watch_cgroup_memory(Unit *u) {
        _cleanup_free_ char *events_path = NULL;
        CGroupContext *cc;
        int wd, q;

        assert(u);

        /* Without a cgroup there is nothing to watch */
        if (!u->cgroup_path)
                return 0;

        /* "memory.events" only exists while the memory controller is enabled, hence couple the watch to
         * memory accounting: looking out for OOM kills is a kind of memory accounting, too. */
        cc = unit_get_cgroup_context(u);
        if (!cc || !cc->memory_accounting)
                return 0;

        /* Inner nodes are skipped: the kernel currently doesn't propagate oom_kill events upwards, and a
         * log message for every ancestor cgroup of a killed process is not wanted either. */
        if (u->type == UNIT_SLICE)
                return 0;

        /* Already watching? Then we are done. */
        if (u->cgroup_memory_inotify_wd >= 0)
                return 0;

        /* The attribute exists on the unified hierarchy only */
        q = cg_all_unified();
        if (q < 0)
                return log_error_errno(q, "Failed to determine whether the memory controller is unified: %m");
        if (q == 0)
                return 0;

        q = hashmap_ensure_allocated(&u->manager->cgroup_memory_inotify_wd_unit, &trivial_hash_ops);
        if (q < 0)
                return log_oom();

        q = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "memory.events", &events_path);
        if (q < 0)
                return log_oom();

        wd = inotify_add_watch(u->manager->cgroup_inotify_fd, events_path, IN_MODIFY);
        u->cgroup_memory_inotify_wd = wd;
        if (wd < 0) {
                /* A cgroup directory that vanished in the meantime needs no tracking, so ENOENT is not
                 * treated as an error. */
                if (errno != ENOENT)
                        return log_unit_error_errno(u, errno, "Failed to add memory inotify watch descriptor for control group %s: %m", u->cgroup_path);

                return 0;
        }

        /* Remember which unit this watch descriptor belongs to */
        q = hashmap_put(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd), u);
        if (q < 0)
                return log_unit_error_errno(u, q, "Failed to add memory inotify watch descriptor to hash map: %m");

        return 0;
}
|
||||
|
||||
int unit_pick_cgroup_path(Unit *u) {
|
||||
_cleanup_free_ char *path = NULL;
|
||||
int r;
|
||||
@ -1692,6 +1759,7 @@ static int unit_create_cgroup(
|
||||
|
||||
/* Start watching it */
|
||||
(void) unit_watch_cgroup(u);
|
||||
(void) unit_watch_cgroup_memory(u);
|
||||
|
||||
/* Preserve enabled controllers in delegated units, adjust others. */
|
||||
if (created || !u->cgroup_realized || !unit_cgroup_delegate(u)) {
|
||||
@ -2232,6 +2300,14 @@ void unit_release_cgroup(Unit *u) {
|
||||
(void) hashmap_remove(u->manager->cgroup_control_inotify_wd_unit, INT_TO_PTR(u->cgroup_control_inotify_wd));
|
||||
u->cgroup_control_inotify_wd = -1;
|
||||
}
|
||||
|
||||
if (u->cgroup_memory_inotify_wd >= 0) {
|
||||
if (inotify_rm_watch(u->manager->cgroup_inotify_fd, u->cgroup_memory_inotify_wd) < 0)
|
||||
log_unit_debug_errno(u, errno, "Failed to remove cgroup memory inotify watch %i for %s, ignoring: %m", u->cgroup_memory_inotify_wd, u->id);
|
||||
|
||||
(void) hashmap_remove(u->manager->cgroup_memory_inotify_wd_unit, INT_TO_PTR(u->cgroup_memory_inotify_wd));
|
||||
u->cgroup_memory_inotify_wd = -1;
|
||||
}
|
||||
}
|
||||
|
||||
void unit_prune_cgroup(Unit *u) {
|
||||
@ -2479,6 +2555,106 @@ void unit_add_to_cgroup_empty_queue(Unit *u) {
|
||||
log_debug_errno(r, "Failed to enable cgroup empty event source: %m");
|
||||
}
|
||||
|
||||
/* Reads the "oom_kill" counter from the unit cgroup's "memory.events" attribute and compares it with the
 * value seen last (u->oom_kill_last, which is updated in any case once the counter was parsed).
 *
 * Returns 1 if the counter grew, in which case a structured log message is emitted and the unit type's
 * notify_cgroup_oom() hook (if any) is invoked. Returns 0 if the unit has no cgroup or the counter did not
 * grow. If reading or parsing fails, the negative value from the debug logging helper is returned. */
static int unit_check_oom(Unit *u) {
        _cleanup_free_ char *value = NULL;
        uint64_t current, previous;
        int q;

        if (!u->cgroup_path)
                return 0;

        q = cg_get_keyed_attribute("memory", u->cgroup_path, "memory.events", STRV_MAKE("oom_kill"), &value);
        if (q < 0)
                return log_unit_debug_errno(u, q, "Failed to read oom_kill field of memory.events cgroup attribute: %m");

        q = safe_atou64(value, &current);
        if (q < 0)
                return log_unit_debug_errno(u, q, "Failed to parse oom_kill field: %m");

        /* Store the new reading before deciding whether anything happened */
        previous = u->oom_kill_last;
        u->oom_kill_last = current;

        if (current <= previous)
                return 0;

        log_struct(LOG_NOTICE,
                   "MESSAGE_ID=" SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR,
                   LOG_UNIT_ID(u),
                   LOG_UNIT_INVOCATION_ID(u),
                   LOG_UNIT_MESSAGE(u, "A process of this unit has been killed by the OOM killer."));

        /* Let the unit type react, if it cares */
        if (UNIT_VTABLE(u)->notify_cgroup_oom)
                UNIT_VTABLE(u)->notify_cgroup_oom(u);

        return 1;
}
|
||||
|
||||
/* Defer event handler for the cgroup OOM queue: takes the unit at the head of the manager's queue, keeps
 * the event source armed if more units are waiting, and then checks the dequeued unit for OOM kills.
 * Always returns 0. */
static int on_cgroup_oom_event(sd_event_source *s, void *userdata) {
        Manager *manager = userdata;
        Unit *head;

        assert(s);
        assert(manager);

        head = manager->cgroup_oom_queue;
        if (!head)
                return 0;

        /* Dequeue the unit */
        assert(head->in_cgroup_oom_queue);
        LIST_REMOVE(cgroup_oom_queue, manager->cgroup_oom_queue, head);
        head->in_cgroup_oom_queue = false;

        if (manager->cgroup_oom_queue) {
                int q;

                /* The queue is not drained yet, hence make sure we get called again */
                q = sd_event_source_set_enabled(s, SD_EVENT_ONESHOT);
                if (q < 0)
                        log_debug_errno(q, "Failed to reenable cgroup oom event source, ignoring: %m");
        }

        (void) unit_check_oom(head);
        return 0;
}
|
||||
|
||||
/* Puts the unit on the manager's cgroup OOM queue (unless it is queued already or has no cgroup) and arms
 * the "cgroup-oom" defer event source as one-shot, creating that source on first use. Failures to set up
 * or enable the event source are logged; the unit remains queued in that case. */
static void unit_add_to_cgroup_oom_queue(Unit *u) {
        Manager *m;
        int q;

        assert(u);

        if (u->in_cgroup_oom_queue || !u->cgroup_path)
                return;

        m = u->manager;

        LIST_PREPEND(cgroup_oom_queue, m->cgroup_oom_queue, u);
        u->in_cgroup_oom_queue = true;

        /* Allocate the defer event source lazily, the first time something gets queued */
        if (!m->cgroup_oom_event_source) {
                _cleanup_(sd_event_source_unrefp) sd_event_source *source = NULL;

                q = sd_event_add_defer(m->event, &source, on_cgroup_oom_event, m);
                if (q < 0) {
                        log_error_errno(q, "Failed to create cgroup oom event source: %m");
                        return;
                }

                q = sd_event_source_set_priority(source, SD_EVENT_PRIORITY_NORMAL-8);
                if (q < 0) {
                        log_error_errno(q, "Failed to set priority of cgroup oom event source: %m");
                        return;
                }

                (void) sd_event_source_set_description(source, "cgroup-oom");
                m->cgroup_oom_event_source = TAKE_PTR(source);
        }

        /* Trigger the defer event */
        q = sd_event_source_set_enabled(m->cgroup_oom_event_source, SD_EVENT_ONESHOT);
        if (q < 0)
                log_error_errno(q, "Failed to enable cgroup oom event source: %m");
}
|
||||
|
||||
static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
|
||||
Manager *m = userdata;
|
||||
|
||||
@ -2510,15 +2686,16 @@ static int on_cgroup_inotify_event(sd_event_source *s, int fd, uint32_t revents,
|
||||
/* The watch was just removed */
|
||||
continue;
|
||||
|
||||
u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
|
||||
if (!u) /* Not that inotify might deliver
|
||||
* events for a watch even after it
|
||||
* was removed, because it was queued
|
||||
* before the removal. Let's ignore
|
||||
* this here safely. */
|
||||
continue;
|
||||
/* Note that inotify might deliver events for a watch even after it was removed,
|
||||
* because it was queued before the removal. Let's ignore this here safely. */
|
||||
|
||||
unit_add_to_cgroup_empty_queue(u);
|
||||
u = hashmap_get(m->cgroup_control_inotify_wd_unit, INT_TO_PTR(e->wd));
|
||||
if (u)
|
||||
unit_add_to_cgroup_empty_queue(u);
|
||||
|
||||
u = hashmap_get(m->cgroup_memory_inotify_wd_unit, INT_TO_PTR(e->wd));
|
||||
if (u)
|
||||
unit_add_to_cgroup_oom_queue(u);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -2709,6 +2886,7 @@ void manager_shutdown_cgroup(Manager *m, bool delete) {
|
||||
m->cgroup_empty_event_source = sd_event_source_unref(m->cgroup_empty_event_source);
|
||||
|
||||
m->cgroup_control_inotify_wd_unit = hashmap_free(m->cgroup_control_inotify_wd_unit);
|
||||
m->cgroup_memory_inotify_wd_unit = hashmap_free(m->cgroup_memory_inotify_wd_unit);
|
||||
|
||||
m->cgroup_inotify_event_source = sd_event_source_unref(m->cgroup_inotify_event_source);
|
||||
m->cgroup_inotify_fd = safe_close(m->cgroup_inotify_fd);
|
||||
|
@ -79,6 +79,9 @@ struct CGroupContext {
|
||||
bool tasks_accounting;
|
||||
bool ip_accounting;
|
||||
|
||||
/* Configures the memory.oom.group attribute (on unified) */
|
||||
bool memory_oom_group;
|
||||
|
||||
bool delegate;
|
||||
CGroupMask delegate_controllers;
|
||||
CGroupMask disable_controllers;
|
||||
@ -174,6 +177,7 @@ int unit_realize_cgroup(Unit *u);
|
||||
void unit_release_cgroup(Unit *u);
|
||||
void unit_prune_cgroup(Unit *u);
|
||||
int unit_watch_cgroup(Unit *u);
|
||||
int unit_watch_cgroup_memory(Unit *u);
|
||||
|
||||
void unit_add_to_cgroup_empty_queue(Unit *u);
|
||||
|
||||
|
@ -43,6 +43,8 @@ static UnitFileFlags unit_file_bools_to_flags(bool runtime, bool force) {
|
||||
(force ? UNIT_FILE_FORCE : 0);
|
||||
}
|
||||
|
||||
BUS_DEFINE_PROPERTY_GET_ENUM(bus_property_get_oom_policy, oom_policy, OOMPolicy);
|
||||
|
||||
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_version, "s", GIT_VERSION);
|
||||
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_features, "s", SYSTEMD_FEATURES);
|
||||
static BUS_DEFINE_PROPERTY_GET_GLOBAL(property_get_architecture, "s", architecture_to_string(uname_architecture()));
|
||||
@ -2452,6 +2454,7 @@ const sd_bus_vtable bus_manager_vtable[] = {
|
||||
SD_BUS_PROPERTY("DefaultLimitRTTIMESoft", "t", bus_property_get_rlimit, offsetof(Manager, rlimit[RLIMIT_RTTIME]), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("DefaultTasksMax", "t", NULL, offsetof(Manager, default_tasks_max), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("TimerSlackNSec", "t", property_get_timer_slack_nsec, 0, SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
SD_BUS_PROPERTY("DefaultOOMPolicy", "s", bus_property_get_oom_policy, offsetof(Manager, default_oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
|
||||
SD_BUS_METHOD("GetUnit", "s", "o", method_get_unit, SD_BUS_VTABLE_UNPRIVILEGED),
|
||||
SD_BUS_METHOD("GetUnitByPID", "u", "o", method_get_unit_by_pid, SD_BUS_VTABLE_UNPRIVILEGED),
|
||||
|
@ -12,3 +12,5 @@ void bus_manager_send_reloading(Manager *m, bool active);
|
||||
void bus_manager_send_change_signal(Manager *m);
|
||||
|
||||
int verify_run_space_and_log(const char *message);
|
||||
|
||||
int bus_property_get_oom_policy(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *ret_error);
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "dbus-cgroup.h"
|
||||
#include "dbus-execute.h"
|
||||
#include "dbus-kill.h"
|
||||
#include "dbus-manager.h"
|
||||
#include "dbus-service.h"
|
||||
#include "dbus-util.h"
|
||||
#include "exit-status.h"
|
||||
@ -127,6 +128,7 @@ const sd_bus_vtable bus_service_vtable[] = {
|
||||
SD_BUS_PROPERTY("UID", "u", bus_property_get_uid, offsetof(Unit, ref_uid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
|
||||
SD_BUS_PROPERTY("GID", "u", bus_property_get_gid, offsetof(Unit, ref_gid), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
|
||||
SD_BUS_PROPERTY("NRestarts", "u", bus_property_get_unsigned, offsetof(Service, n_restarts), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
|
||||
SD_BUS_PROPERTY("OOMPolicy", "s", bus_property_get_oom_policy, offsetof(Service, oom_policy), SD_BUS_VTABLE_PROPERTY_CONST),
|
||||
|
||||
BUS_EXEC_STATUS_VTABLE("ExecMain", offsetof(Service, main_exec_status), SD_BUS_VTABLE_PROPERTY_EMITS_CHANGE),
|
||||
BUS_EXEC_COMMAND_LIST_VTABLE("ExecStartPre", offsetof(Service, exec_command[SERVICE_EXEC_START_PRE]), SD_BUS_VTABLE_PROPERTY_EMITS_INVALIDATION),
|
||||
@ -257,6 +259,7 @@ static int bus_set_transient_std_fd(
|
||||
static BUS_DEFINE_SET_TRANSIENT_PARSE(notify_access, NotifyAccess, notify_access_from_string);
|
||||
static BUS_DEFINE_SET_TRANSIENT_PARSE(service_type, ServiceType, service_type_from_string);
|
||||
static BUS_DEFINE_SET_TRANSIENT_PARSE(service_restart, ServiceRestart, service_restart_from_string);
|
||||
static BUS_DEFINE_SET_TRANSIENT_PARSE(oom_policy, OOMPolicy, oom_policy_from_string);
|
||||
static BUS_DEFINE_SET_TRANSIENT_STRING_WITH_CHECK(bus_name, service_name_is_valid);
|
||||
|
||||
static int bus_service_set_transient_property(
|
||||
@ -291,6 +294,9 @@ static int bus_service_set_transient_property(
|
||||
if (streq(name, "Type"))
|
||||
return bus_set_transient_service_type(u, name, &s->type, message, flags, error);
|
||||
|
||||
if (streq(name, "OOMPolicy"))
|
||||
return bus_set_transient_oom_policy(u, name, &s->oom_policy, message, flags, error);
|
||||
|
||||
if (streq(name, "RestartUSec"))
|
||||
return bus_set_transient_usec(u, name, &s->restart_usec, message, flags, error);
|
||||
|
||||
|
@ -333,6 +333,7 @@ Service.Sockets, config_parse_service_sockets, 0,
|
||||
Service.BusPolicy, config_parse_warn_compat, DISABLED_LEGACY, 0
|
||||
Service.USBFunctionDescriptors, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_descriptors)
|
||||
Service.USBFunctionStrings, config_parse_unit_path_printf, 0, offsetof(Service, usb_function_strings)
|
||||
Service.OOMPolicy, config_parse_oom_policy, 0, offsetof(Service, oom_policy)
|
||||
EXEC_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
|
||||
CGROUP_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
|
||||
KILL_CONTEXT_CONFIG_ITEMS(Service)m4_dnl
|
||||
|
@ -86,6 +86,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_runtime_preserve_mode, exec_preserve_mode,
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_service_type, service_type, ServiceType, "Failed to parse service type");
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceRestart, "Failed to parse service restart specifier");
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value");
|
||||
DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy");
|
||||
DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value");
|
||||
DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight");
|
||||
DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight");
|
||||
|
@ -106,6 +106,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_pid_file);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_exit_status);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers);
|
||||
CONFIG_PARSER_PROTOTYPE(config_parse_oom_policy);
|
||||
|
||||
/* gperf prototypes */
|
||||
const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length);
|
||||
|
@ -135,6 +135,7 @@ static bool arg_default_tasks_accounting = true;
|
||||
static uint64_t arg_default_tasks_max = UINT64_MAX;
|
||||
static sd_id128_t arg_machine_id = {};
|
||||
static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
|
||||
static OOMPolicy arg_default_oom_policy = OOM_STOP;
|
||||
|
||||
_noreturn_ static void freeze_or_exit_or_reboot(void) {
|
||||
|
||||
@ -725,6 +726,7 @@ static int parse_config_file(void) {
|
||||
{ "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
|
||||
{ "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
|
||||
{ "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
|
||||
{ "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
|
||||
{}
|
||||
};
|
||||
|
||||
@ -780,6 +782,7 @@ static void set_manager_defaults(Manager *m) {
|
||||
m->default_memory_accounting = arg_default_memory_accounting;
|
||||
m->default_tasks_accounting = arg_default_tasks_accounting;
|
||||
m->default_tasks_max = arg_default_tasks_max;
|
||||
m->default_oom_policy = arg_default_oom_policy;
|
||||
|
||||
(void) manager_set_default_rlimits(m, arg_default_rlimit);
|
||||
|
||||
|
@ -764,6 +764,8 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager
|
||||
.have_ask_password = -EINVAL, /* we don't know */
|
||||
.first_boot = -1,
|
||||
.test_run_flags = test_run_flags,
|
||||
|
||||
.default_oom_policy = OOM_STOP,
|
||||
};
|
||||
|
||||
#if ENABLE_EFI
|
||||
@ -4714,3 +4716,11 @@ static const char *const manager_timestamp_table[_MANAGER_TIMESTAMP_MAX] = {
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(manager_timestamp, ManagerTimestamp);
|
||||
|
||||
static const char* const oom_policy_table[_OOM_POLICY_MAX] = {
|
||||
[OOM_CONTINUE] = "continue",
|
||||
[OOM_STOP] = "stop",
|
||||
[OOM_KILL] = "kill",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(oom_policy, OOMPolicy);
|
||||
|
@ -56,6 +56,14 @@ typedef enum StatusType {
|
||||
STATUS_TYPE_EMERGENCY,
|
||||
} StatusType;
|
||||
|
||||
/* How a unit reacts when the kernel's OOM killer hit one of its processes */
typedef enum OOMPolicy {
        /* The kernel kills the process it wants to kill, and that's it */
        OOM_CONTINUE,

        /* The kernel kills the process it wants to kill, and we stop the unit */
        OOM_STOP,

        /* The kernel kills the process it wants to kill, and all others in the unit, and we stop the
         * unit */
        OOM_KILL,

        _OOM_POLICY_MAX,
        _OOM_POLICY_INVALID = -1
} OOMPolicy;
|
||||
|
||||
/* Notes:
|
||||
* 1. TIMESTAMP_FIRMWARE, TIMESTAMP_LOADER, TIMESTAMP_KERNEL, TIMESTAMP_INITRD,
|
||||
* TIMESTAMP_SECURITY_START, and TIMESTAMP_SECURITY_FINISH are set only when
|
||||
@ -159,6 +167,9 @@ struct Manager {
|
||||
/* Units whose cgroup ran empty */
|
||||
LIST_HEAD(Unit, cgroup_empty_queue);
|
||||
|
||||
/* Units whose memory.events fired */
|
||||
LIST_HEAD(Unit, cgroup_oom_queue);
|
||||
|
||||
/* Target units whose default target dependencies haven't been set yet */
|
||||
LIST_HEAD(Unit, target_deps_queue);
|
||||
|
||||
@ -268,10 +279,15 @@ struct Manager {
|
||||
/* Notifications from cgroups, when the unified hierarchy is used is done via inotify. */
|
||||
int cgroup_inotify_fd;
|
||||
sd_event_source *cgroup_inotify_event_source;
|
||||
|
||||
/* Maps for finding the unit for each inotify watch descriptor for the cgroup.events and
|
||||
* memory.events cgroupv2 attributes. */
|
||||
Hashmap *cgroup_control_inotify_wd_unit;
|
||||
Hashmap *cgroup_memory_inotify_wd_unit;
|
||||
|
||||
/* A defer event for handling cgroup empty events and processing them after SIGCHLD in all cases. */
|
||||
sd_event_source *cgroup_empty_event_source;
|
||||
sd_event_source *cgroup_oom_event_source;
|
||||
|
||||
/* Make sure the user cannot accidentally unmount our cgroup
|
||||
* file system */
|
||||
@ -328,6 +344,8 @@ struct Manager {
|
||||
uint64_t default_tasks_max;
|
||||
usec_t default_timer_accuracy_usec;
|
||||
|
||||
OOMPolicy default_oom_policy;
|
||||
|
||||
int original_log_level;
|
||||
LogTarget original_log_target;
|
||||
bool log_level_overridden:1;
|
||||
@ -519,3 +537,6 @@ void manager_disable_confirm_spawn(void);
|
||||
const char *manager_timestamp_to_string(ManagerTimestamp m) _const_;
|
||||
ManagerTimestamp manager_timestamp_from_string(const char *s) _pure_;
|
||||
ManagerTimestamp manager_timestamp_initrd_mangle(ManagerTimestamp s);
|
||||
|
||||
const char* oom_policy_to_string(OOMPolicy i) _const_;
|
||||
OOMPolicy oom_policy_from_string(const char *s) _pure_;
|
||||
|
@ -112,6 +112,8 @@ static void service_init(Unit *u) {
|
||||
EXEC_KEYRING_PRIVATE : EXEC_KEYRING_INHERIT;
|
||||
|
||||
s->watchdog_original_usec = USEC_INFINITY;
|
||||
|
||||
s->oom_policy = _OOM_POLICY_INVALID;
|
||||
}
|
||||
|
||||
static void service_unwatch_control_pid(Service *s) {
|
||||
@ -731,6 +733,15 @@ static int service_add_extras(Service *s) {
|
||||
(s->type == SERVICE_NOTIFY || s->watchdog_usec > 0 || s->n_fd_store_max > 0))
|
||||
s->notify_access = NOTIFY_MAIN;
|
||||
|
||||
/* If no OOM policy was explicitly set, then default to the configured default OOM policy. Except when
|
||||
* delegation is on, in that case we assume the payload knows better what to do and can process
|
||||
* things in a more focussed way. */
|
||||
if (s->oom_policy < 0)
|
||||
s->oom_policy = s->cgroup_context.delegate ? OOM_CONTINUE : UNIT(s)->manager->default_oom_policy;
|
||||
|
||||
/* Let the kernel do the killing if that's requested. */
|
||||
s->cgroup_context.memory_oom_group = s->oom_policy == OOM_KILL;
|
||||
|
||||
r = service_add_default_dependencies(s);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -799,7 +810,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
|
||||
"%sType: %s\n"
|
||||
"%sRestart: %s\n"
|
||||
"%sNotifyAccess: %s\n"
|
||||
"%sNotifyState: %s\n",
|
||||
"%sNotifyState: %s\n"
|
||||
"%sOOMPolicy: %s\n",
|
||||
prefix, service_state_to_string(s->state),
|
||||
prefix, service_result_to_string(s->result),
|
||||
prefix, service_result_to_string(s->reload_result),
|
||||
@ -810,7 +822,8 @@ static void service_dump(Unit *u, FILE *f, const char *prefix) {
|
||||
prefix, service_type_to_string(s->type),
|
||||
prefix, service_restart_to_string(s->restart),
|
||||
prefix, notify_access_to_string(s->notify_access),
|
||||
prefix, notify_state_to_string(s->notify_state));
|
||||
prefix, notify_state_to_string(s->notify_state),
|
||||
prefix, oom_policy_to_string(s->oom_policy));
|
||||
|
||||
if (s->control_pid > 0)
|
||||
fprintf(f,
|
||||
@ -3211,6 +3224,57 @@ static void service_notify_cgroup_empty_event(Unit *u) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Unit vtable hook, called when an OOM kill was seen in the service's cgroup. Does nothing beyond a debug
 * message if the service's policy is OOM_CONTINUE. Otherwise the reaction depends on the current service
 * state and on whether the policy is OOM_STOP or OOM_KILL; the failure result used throughout is
 * SERVICE_FAILURE_OOM_KILL. */
static void service_notify_cgroup_oom_event(Unit *u) {
        Service *s = SERVICE(u);

        log_unit_debug(u, "Process of control group was killed by the OOM killer.");

        if (s->oom_policy == OOM_CONTINUE)
                return;

        switch (s->state) {

        case SERVICE_START_PRE:
        case SERVICE_START:
        case SERVICE_START_POST:
        case SERVICE_STOP:
                /* Skip ahead to signalling: SIGTERM for "stop", SIGKILL for "kill" */
                switch (s->oom_policy) {

                case OOM_STOP:
                        service_enter_signal(s, SERVICE_STOP_SIGTERM, SERVICE_FAILURE_OOM_KILL);
                        break;

                case OOM_KILL:
                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
                        break;

                default:
                        break;
                }

                break;

        case SERVICE_EXITED:
        case SERVICE_RUNNING:
                /* "stop" enters the stop state first, "kill" goes straight to SIGKILL */
                switch (s->oom_policy) {

                case OOM_STOP:
                        service_enter_stop(s, SERVICE_FAILURE_OOM_KILL);
                        break;

                case OOM_KILL:
                        service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
                        break;

                default:
                        break;
                }

                break;

        case SERVICE_STOP_WATCHDOG:
        case SERVICE_STOP_SIGTERM:
                /* Already terminating: escalate to SIGKILL */
                service_enter_signal(s, SERVICE_STOP_SIGKILL, SERVICE_FAILURE_OOM_KILL);
                break;

        case SERVICE_STOP_SIGKILL:
        case SERVICE_FINAL_SIGKILL:
                /* Already killing: only record the failure reason, unless another one was set before */
                if (s->result == SERVICE_SUCCESS)
                        s->result = SERVICE_FAILURE_OOM_KILL;
                break;

        case SERVICE_STOP_POST:
        case SERVICE_FINAL_SIGTERM:
                service_enter_signal(s, SERVICE_FINAL_SIGKILL, SERVICE_FAILURE_OOM_KILL);
                break;

        default:
                ;
        }
}
|
||||
|
||||
static void service_sigchld_event(Unit *u, pid_t pid, int code, int status) {
|
||||
bool notify_dbus = true;
|
||||
Service *s = SERVICE(u);
|
||||
@ -4116,6 +4180,7 @@ static const char* const service_result_table[_SERVICE_RESULT_MAX] = {
|
||||
[SERVICE_FAILURE_CORE_DUMP] = "core-dump",
|
||||
[SERVICE_FAILURE_WATCHDOG] = "watchdog",
|
||||
[SERVICE_FAILURE_START_LIMIT_HIT] = "start-limit-hit",
|
||||
[SERVICE_FAILURE_OOM_KILL] = "oom-kill",
|
||||
};
|
||||
|
||||
DEFINE_STRING_TABLE_LOOKUP(service_result, ServiceResult);
|
||||
@ -4169,6 +4234,7 @@ const UnitVTable service_vtable = {
|
||||
.reset_failed = service_reset_failed,
|
||||
|
||||
.notify_cgroup_empty = service_notify_cgroup_empty_event,
|
||||
.notify_cgroup_oom = service_notify_cgroup_oom_event,
|
||||
.notify_message = service_notify_message,
|
||||
|
||||
.main_pid = service_main_pid,
|
||||
|
@ -67,6 +67,7 @@ typedef enum ServiceResult {
|
||||
SERVICE_FAILURE_CORE_DUMP,
|
||||
SERVICE_FAILURE_WATCHDOG,
|
||||
SERVICE_FAILURE_START_LIMIT_HIT,
|
||||
SERVICE_FAILURE_OOM_KILL,
|
||||
_SERVICE_RESULT_MAX,
|
||||
_SERVICE_RESULT_INVALID = -1
|
||||
} ServiceResult;
|
||||
@ -184,6 +185,8 @@ struct Service {
|
||||
|
||||
unsigned n_restarts;
|
||||
bool flush_n_restarts;
|
||||
|
||||
OOMPolicy oom_policy;
|
||||
};
|
||||
|
||||
extern const UnitVTable service_vtable;
|
||||
|
@ -92,6 +92,7 @@ Unit *unit_new(Manager *m, size_t size) {
|
||||
u->unit_file_preset = -1;
|
||||
u->on_failure_job_mode = JOB_REPLACE;
|
||||
u->cgroup_control_inotify_wd = -1;
|
||||
u->cgroup_memory_inotify_wd = -1;
|
||||
u->job_timeout = USEC_INFINITY;
|
||||
u->job_running_timeout = USEC_INFINITY;
|
||||
u->ref_uid = UID_INVALID;
|
||||
@ -3245,6 +3246,9 @@ int unit_serialize(Unit *u, FILE *f, FDSet *fds, bool serialize_jobs) {
|
||||
if (u->cpu_usage_last != NSEC_INFINITY)
|
||||
(void) serialize_item_format(f, "cpu-usage-last", "%" PRIu64, u->cpu_usage_last);
|
||||
|
||||
if (u->oom_kill_last > 0)
|
||||
(void) serialize_item_format(f, "oom-kill-last", "%" PRIu64, u->oom_kill_last);
|
||||
|
||||
if (u->cgroup_path)
|
||||
(void) serialize_item(f, "cgroup", u->cgroup_path);
|
||||
|
||||
@ -3478,6 +3482,14 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
|
||||
|
||||
continue;
|
||||
|
||||
} else if (streq(l, "oom-kill-last")) {
|
||||
|
||||
r = safe_atou64(v, &u->oom_kill_last);
|
||||
if (r < 0)
|
||||
log_unit_debug(u, "Failed to read OOM kill last %s, ignoring.", v);
|
||||
|
||||
continue;
|
||||
|
||||
} else if (streq(l, "cgroup")) {
|
||||
|
||||
r = unit_set_cgroup_path(u, v);
|
||||
@ -3485,6 +3497,7 @@ int unit_deserialize(Unit *u, FILE *f, FDSet *fds) {
|
||||
log_unit_debug_errno(u, r, "Failed to set cgroup path %s, ignoring: %m", v);
|
||||
|
||||
(void) unit_watch_cgroup(u);
|
||||
(void) unit_watch_cgroup_memory(u);
|
||||
|
||||
continue;
|
||||
} else if (streq(l, "cgroup-realized")) {
|
||||
|
@ -200,6 +200,9 @@ typedef struct Unit {
|
||||
/* cgroup empty queue */
|
||||
LIST_FIELDS(Unit, cgroup_empty_queue);
|
||||
|
||||
/* cgroup OOM queue */
|
||||
LIST_FIELDS(Unit, cgroup_oom_queue);
|
||||
|
||||
/* Target dependencies queue */
|
||||
LIST_FIELDS(Unit, target_deps_queue);
|
||||
|
||||
@ -246,13 +249,19 @@ typedef struct Unit {
|
||||
nsec_t cpu_usage_base;
|
||||
nsec_t cpu_usage_last; /* the most recently read value */
|
||||
|
||||
/* The current counter of the oom_kill field in the memory.events cgroup attribute */
|
||||
uint64_t oom_kill_last;
|
||||
|
||||
/* Counterparts in the cgroup filesystem */
|
||||
char *cgroup_path;
|
||||
CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
|
||||
CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
|
||||
CGroupMask cgroup_invalidated_mask; /* A mask specifiying controllers which shall be considered invalidated, and require re-realization */
|
||||
CGroupMask cgroup_members_mask; /* A cache for the controllers required by all children of this cgroup (only relevant for slice units) */
|
||||
|
||||
/* Inotify watch descriptors for watching cgroup.events and memory.events on cgroupv2 */
|
||||
int cgroup_control_inotify_wd;
|
||||
int cgroup_memory_inotify_wd;
|
||||
|
||||
/* Device Controller BPF program */
|
||||
BPFProgram *bpf_device_control_installed;
|
||||
@ -320,6 +329,7 @@ typedef struct Unit {
|
||||
bool in_gc_queue:1;
|
||||
bool in_cgroup_realize_queue:1;
|
||||
bool in_cgroup_empty_queue:1;
|
||||
bool in_cgroup_oom_queue:1;
|
||||
bool in_target_deps_queue:1;
|
||||
bool in_stop_when_unneeded_queue:1;
|
||||
|
||||
@ -494,10 +504,12 @@ typedef struct UnitVTable {
|
||||
/* Reset failed state if we are in failed state */
|
||||
void (*reset_failed)(Unit *u);
|
||||
|
||||
/* Called whenever any of the cgroups this unit watches for
|
||||
* ran empty */
|
||||
/* Called whenever any of the cgroups this unit watches for ran empty */
|
||||
void (*notify_cgroup_empty)(Unit *u);
|
||||
|
||||
/* Called whenever an OOM kill event on this unit was seen */
|
||||
void (*notify_cgroup_oom)(Unit *u);
|
||||
|
||||
/* Called whenever a process of this unit sends us a message */
|
||||
void (*notify_message)(Unit *u, const struct ucred *ucred, char **tags, FDSet *fds);
|
||||
|
||||
|
@ -1314,7 +1314,7 @@ static int bus_append_service_property(sd_bus_message *m, const char *field, con
|
||||
|
||||
if (STR_IN_SET(field,
|
||||
"PIDFile", "Type", "Restart", "BusName", "NotifyAccess",
|
||||
"USBFunctionDescriptors", "USBFunctionStrings"))
|
||||
"USBFunctionDescriptors", "USBFunctionStrings", "OOMPolicy"))
|
||||
|
||||
return bus_append_string(m, field, eq);
|
||||
|
||||
|
@ -125,6 +125,9 @@ _SD_BEGIN_DECLARATIONS;
|
||||
#define SD_MESSAGE_OVERMOUNTING SD_ID128_MAKE(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)
|
||||
#define SD_MESSAGE_OVERMOUNTING_STR SD_ID128_MAKE_STR(1d,ee,03,69,c7,fc,47,36,b7,09,9b,38,ec,b4,6e,e7)
|
||||
|
||||
#define SD_MESSAGE_UNIT_OUT_OF_MEMORY SD_ID128_MAKE(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
|
||||
#define SD_MESSAGE_UNIT_OUT_OF_MEMORY_STR SD_ID128_MAKE_STR(fe,6f,aa,94,e7,77,46,63,a0,da,52,71,78,91,d8,ef)
|
||||
|
||||
#define SD_MESSAGE_LID_OPENED SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f)
|
||||
#define SD_MESSAGE_LID_OPENED_STR SD_ID128_MAKE_STR(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,6f)
|
||||
#define SD_MESSAGE_LID_CLOSED SD_ID128_MAKE(b7,2e,a4,a2,88,15,45,a0,b5,0e,20,0e,55,b9,b0,70)
|
||||
|
Loading…
x
Reference in New Issue
Block a user