mirror of
https://github.com/systemd/systemd.git
synced 2024-12-25 01:34:28 +03:00
Merge pull request #19126 from anitazha/oomdimprovements
systemd-oomd post-test week improvements
This commit is contained in:
commit
9d5ae3a121
@ -52,9 +52,9 @@
|
||||
|
||||
<listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command>
|
||||
will take action. If the fraction of swap used on the system is more than what is defined here,
|
||||
<command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
|
||||
ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and
|
||||
what action gets taken depends on what the unit has configured for
|
||||
<command>systemd-oomd</command> will act on eligible descendant control groups with swap usage greater
|
||||
than 5% of total swap, starting from the ones with the highest swap usage. Which
|
||||
control groups are monitored and what action gets taken depends on what the unit has configured for
|
||||
<varname>ManagedOOMSwap=</varname>. Takes a value specified in percent (when suffixed with "%"),
|
||||
permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
|
||||
</varlistentry>
|
||||
@ -81,7 +81,7 @@
|
||||
<listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
|
||||
limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
|
||||
<varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
|
||||
Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
|
||||
Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
|
@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
|
||||
_cleanup_set_free_ Set *targets = NULL;
|
||||
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
|
||||
Manager *m = userdata;
|
||||
usec_t usec_now;
|
||||
int r;
|
||||
@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to reset event timer: %m");
|
||||
|
||||
r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
|
||||
r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to set relative time for timer: %m");
|
||||
|
||||
@ -324,13 +323,98 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
return log_error_errno(r, "Failed to acquire varlink connection: %m");
|
||||
}
|
||||
|
||||
/* Update the cgroups used for detection/action */
|
||||
r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
|
||||
/* We still try to acquire swap information for oomctl even if no units want swap monitoring */
|
||||
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
|
||||
/* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
|
||||
* Allow ENOENT in the event that swap is disabled on the system. */
|
||||
if (r == -ENOENT) {
|
||||
zero(m->system_context);
|
||||
return 0;
|
||||
} else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
|
||||
return log_error_errno(r, "Failed to acquire system context: %m");
|
||||
|
||||
/* Return early if nothing is requesting swap monitoring */
|
||||
if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
|
||||
return 0;
|
||||
|
||||
/* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
|
||||
* system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
|
||||
* is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
|
||||
* nodes are the ones that matter). */
|
||||
|
||||
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
|
||||
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
|
||||
_cleanup_free_ char *selected = NULL;
|
||||
uint64_t threshold;
|
||||
|
||||
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
|
||||
m->system_context.swap_used, m->system_context.swap_total,
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
|
||||
|
||||
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
|
||||
|
||||
threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
|
||||
r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
|
||||
else {
|
||||
if (selected)
|
||||
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_STR,
|
||||
selected, m->system_context.swap_used, m->system_context.swap_total,
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void clear_candidate_hashmapp(Manager **m) {
|
||||
if (*m)
|
||||
hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
|
||||
}
|
||||
|
||||
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
|
||||
/* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
|
||||
* update the candidate data (in which case clear_candidates will be NULL). */
|
||||
_cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
|
||||
_cleanup_set_free_ Set *targets = NULL;
|
||||
bool in_post_action_delay = false;
|
||||
Manager *m = userdata;
|
||||
usec_t usec_now;
|
||||
int r;
|
||||
|
||||
assert(s);
|
||||
assert(userdata);
|
||||
|
||||
/* Reset timer */
|
||||
r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to reset event timer: %m");
|
||||
|
||||
r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to set relative time for timer: %m");
|
||||
|
||||
/* Reconnect if our connection dropped */
|
||||
if (!m->varlink) {
|
||||
r = acquire_managed_oom_connect(m);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to acquire varlink connection: %m");
|
||||
}
|
||||
|
||||
/* Return early if nothing is requesting memory pressure monitoring */
|
||||
if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
|
||||
return 0;
|
||||
|
||||
/* Update the cgroups used for detection/action */
|
||||
r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
@ -344,23 +428,13 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
|
||||
|
||||
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
|
||||
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
|
||||
* Allow ENOENT in the event that swap is disabled on the system. */
|
||||
if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
|
||||
return log_error_errno(r, "Failed to acquire system context: %m");
|
||||
else if (r == -ENOENT)
|
||||
zero(m->system_context);
|
||||
|
||||
if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
|
||||
m->last_reclaim_at = usec_now;
|
||||
|
||||
/* If we're still recovering from a kill, don't try to kill again yet */
|
||||
if (m->post_action_delay_start > 0) {
|
||||
if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
|
||||
return 0;
|
||||
/* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
|
||||
* values and go on a kill storm. */
|
||||
if (m->mem_pressure_post_action_delay_start > 0) {
|
||||
if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
|
||||
in_post_action_delay = true;
|
||||
else
|
||||
m->post_action_delay_start = 0;
|
||||
m->mem_pressure_post_action_delay_start = 0;
|
||||
}
|
||||
|
||||
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
|
||||
@ -368,91 +442,92 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
|
||||
else if (r == 1) {
|
||||
/* Check if there was reclaim activity in the given interval. The concern is the following case:
|
||||
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
|
||||
* cgroup. Even after this, well-behaved processes will fault in recently resident pages and
|
||||
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
|
||||
* to kill something (it won't help anyways). */
|
||||
if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
|
||||
OomdCGroupContext *t;
|
||||
else if (r == 1 && !in_post_action_delay) {
|
||||
OomdCGroupContext *t;
|
||||
SET_FOREACH(t, targets) {
|
||||
_cleanup_free_ char *selected = NULL;
|
||||
char ts[FORMAT_TIMESPAN_MAX];
|
||||
|
||||
SET_FOREACH(t, targets) {
|
||||
_cleanup_free_ char *selected = NULL;
|
||||
char ts[FORMAT_TIMESPAN_MAX];
|
||||
/* Check if there was reclaim activity in the given interval. The concern is the following case:
|
||||
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
|
||||
* cgroup. Even after this, well-behaved processes will fault in recently resident pages and
|
||||
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
|
||||
* to kill something (it won't help anyways). */
|
||||
if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
|
||||
continue;
|
||||
|
||||
log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
|
||||
t->path,
|
||||
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
|
||||
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
|
||||
format_timespan(ts, sizeof ts,
|
||||
m->default_mem_pressure_duration_usec,
|
||||
USEC_PER_SEC));
|
||||
log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
|
||||
t->path,
|
||||
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
|
||||
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
|
||||
format_timespan(ts, sizeof ts,
|
||||
m->default_mem_pressure_duration_usec,
|
||||
USEC_PER_SEC));
|
||||
|
||||
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
|
||||
else {
|
||||
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
|
||||
m->post_action_delay_start = usec_now;
|
||||
if (selected)
|
||||
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
|
||||
" for > %s with reclaim activity",
|
||||
selected, t->path,
|
||||
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
|
||||
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
|
||||
format_timespan(ts, sizeof ts,
|
||||
m->default_mem_pressure_duration_usec,
|
||||
USEC_PER_SEC));
|
||||
return 0;
|
||||
}
|
||||
r = update_monitored_cgroup_contexts_candidates(
|
||||
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
|
||||
else
|
||||
clear_candidates = NULL;
|
||||
|
||||
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
|
||||
else {
|
||||
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
|
||||
m->mem_pressure_post_action_delay_start = usec_now;
|
||||
if (selected)
|
||||
log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
|
||||
" for > %s with reclaim activity",
|
||||
selected, t->path,
|
||||
LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
|
||||
LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
|
||||
format_timespan(ts, sizeof ts,
|
||||
m->default_mem_pressure_duration_usec,
|
||||
USEC_PER_SEC));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
|
||||
* monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
|
||||
* might happen.
|
||||
* Candidate cgroup data will continue to get updated during the post-action delay period in case
|
||||
* pressure continues to be high after a kill. */
|
||||
OomdCGroupContext *c;
|
||||
HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
|
||||
if (c->mem_pressure_limit_hit_start == 0)
|
||||
continue;
|
||||
|
||||
if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
|
||||
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
|
||||
_cleanup_free_ char *selected = NULL;
|
||||
|
||||
log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
|
||||
m->system_context.swap_used, m->system_context.swap_total,
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
|
||||
|
||||
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
|
||||
|
||||
r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
|
||||
else {
|
||||
m->post_action_delay_start = usec_now;
|
||||
if (selected)
|
||||
log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_STR,
|
||||
selected, m->system_context.swap_used, m->system_context.swap_total,
|
||||
PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
|
||||
return 0;
|
||||
r = update_monitored_cgroup_contexts_candidates(
|
||||
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
|
||||
else {
|
||||
clear_candidates = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int monitor_cgroup_contexts(Manager *m) {
|
||||
static int monitor_swap_contexts(Manager *m) {
|
||||
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
|
||||
int r;
|
||||
|
||||
assert(m);
|
||||
assert(m->event);
|
||||
|
||||
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
|
||||
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -464,9 +539,34 @@ static int monitor_cgroup_contexts(Manager *m) {
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
(void) sd_event_source_set_description(s, "oomd-timer");
|
||||
(void) sd_event_source_set_description(s, "oomd-swap-timer");
|
||||
|
||||
m->cgroup_context_event_source = TAKE_PTR(s);
|
||||
m->swap_context_event_source = TAKE_PTR(s);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int monitor_memory_pressure_contexts(Manager *m) {
|
||||
_cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
|
||||
int r;
|
||||
|
||||
assert(m);
|
||||
assert(m->event);
|
||||
|
||||
r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = sd_event_source_set_exit_on_failure(s, true);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = sd_event_source_set_enabled(s, SD_EVENT_ON);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
(void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
|
||||
|
||||
m->mem_pressure_context_event_source = TAKE_PTR(s);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -474,7 +574,8 @@ Manager* manager_free(Manager *m) {
|
||||
assert(m);
|
||||
|
||||
varlink_close_unref(m->varlink);
|
||||
sd_event_source_unref(m->cgroup_context_event_source);
|
||||
sd_event_source_unref(m->swap_context_event_source);
|
||||
sd_event_source_unref(m->mem_pressure_context_event_source);
|
||||
sd_event_unref(m->event);
|
||||
|
||||
bus_verify_polkit_async_registry_free(m->polkit_registry);
|
||||
@ -596,7 +697,11 @@ int manager_start(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = monitor_cgroup_contexts(m);
|
||||
r = monitor_memory_pressure_contexts(m);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = monitor_swap_contexts(m);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
|
@ -7,10 +7,9 @@
|
||||
#include "varlink.h"
|
||||
|
||||
/* Polling interval for monitoring stats */
|
||||
#define INTERVAL_USEC (1 * USEC_PER_SEC)
|
||||
|
||||
/* Used to weight the averages */
|
||||
#define AVERAGE_SIZE_DECAY 4
|
||||
#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
|
||||
/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
|
||||
#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
|
||||
|
||||
/* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
|
||||
* percentage of time all tasks were delayed (i.e. unproductive).
|
||||
@ -20,6 +19,9 @@
|
||||
#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
|
||||
#define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
|
||||
|
||||
/* Only tackle candidates with large swap usage. */
|
||||
#define THRESHOLD_SWAP_USED_PERCENT 5
|
||||
|
||||
#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
|
||||
#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
|
||||
|
||||
@ -44,10 +46,10 @@ struct Manager {
|
||||
|
||||
OomdSystemContext system_context;
|
||||
|
||||
usec_t last_reclaim_at;
|
||||
usec_t post_action_delay_start;
|
||||
usec_t mem_pressure_post_action_delay_start;
|
||||
|
||||
sd_event_source *cgroup_context_event_source;
|
||||
sd_event_source *swap_context_event_source;
|
||||
sd_event_source *mem_pressure_context_event_source;
|
||||
|
||||
Varlink *varlink;
|
||||
};
|
||||
|
@ -82,17 +82,17 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
|
||||
if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
|
||||
usec_t diff;
|
||||
|
||||
if (ctx->last_hit_mem_pressure_limit == 0)
|
||||
ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
|
||||
if (ctx->mem_pressure_limit_hit_start == 0)
|
||||
ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
|
||||
|
||||
diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
|
||||
diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
|
||||
if (diff >= duration) {
|
||||
r = set_put(targets, ctx);
|
||||
if (r < 0)
|
||||
return -ENOMEM;
|
||||
}
|
||||
} else
|
||||
ctx->last_hit_mem_pressure_limit = 0;
|
||||
ctx->mem_pressure_limit_hit_start = 0;
|
||||
}
|
||||
|
||||
if (!set_isempty(targets)) {
|
||||
@ -104,34 +104,21 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool oomd_memory_reclaim(Hashmap *h) {
|
||||
uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
|
||||
OomdCGroupContext *ctx;
|
||||
uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
|
||||
uint64_t last_pgscan;
|
||||
|
||||
assert(h);
|
||||
assert(c);
|
||||
|
||||
/* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
|
||||
* there was reclaim activity. Used along with pressure checks to decide whether to take action. */
|
||||
|
||||
HASHMAP_FOREACH(ctx, h) {
|
||||
uint64_t sum;
|
||||
|
||||
sum = pgscan + ctx->pgscan;
|
||||
if (sum < pgscan || sum < ctx->pgscan)
|
||||
pgscan_of++; /* count overflows */
|
||||
pgscan = sum;
|
||||
|
||||
sum = last_pgscan + ctx->last_pgscan;
|
||||
if (sum < last_pgscan || sum < ctx->last_pgscan)
|
||||
last_pgscan_of++; /* count overflows */
|
||||
last_pgscan = sum;
|
||||
/* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
|
||||
* pgscan is monotonic and in practice should not decrease (except in the recreation case). */
|
||||
last_pgscan = c->last_pgscan;
|
||||
if (c->last_pgscan > c->pgscan) {
|
||||
log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
|
||||
c->last_pgscan, c->pgscan, c->path);
|
||||
last_pgscan = 0;
|
||||
}
|
||||
|
||||
/* overflow counts are the same, return sums comparison */
|
||||
if (last_pgscan_of == pgscan_of)
|
||||
return pgscan > last_pgscan;
|
||||
|
||||
return pgscan_of > last_pgscan_of;
|
||||
return c->pgscan - last_pgscan;
|
||||
}
|
||||
|
||||
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
|
||||
@ -246,7 +233,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char
|
||||
return ret;
|
||||
}
|
||||
|
||||
int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
|
||||
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
|
||||
_cleanup_free_ OomdCGroupContext **sorted = NULL;
|
||||
int n, r, ret = 0;
|
||||
|
||||
@ -257,12 +244,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
|
||||
if (n < 0)
|
||||
return n;
|
||||
|
||||
/* Try to kill cgroups with non-zero swap usage until we either succeed in
|
||||
* killing or we get to a cgroup with no swap usage. */
|
||||
/* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
|
||||
* no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
|
||||
for (int i = 0; i < n; i++) {
|
||||
/* Skip over cgroups with no resource usage.
|
||||
* Continue break since there might be "avoid" cgroups at the end. */
|
||||
if (sorted[i]->swap_usage == 0)
|
||||
/* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
|
||||
* cgroups at the end. */
|
||||
if (sorted[i]->swap_usage <= threshold_usage)
|
||||
continue;
|
||||
|
||||
r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
|
||||
@ -430,9 +417,13 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
|
||||
if (old_ctx) {
|
||||
curr_ctx->last_pgscan = old_ctx->pgscan;
|
||||
curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
|
||||
curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
|
||||
curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
|
||||
curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
|
||||
}
|
||||
|
||||
if (oomd_pgscan_rate(curr_ctx) > 0)
|
||||
curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
|
||||
|
||||
r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -456,7 +447,11 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
|
||||
|
||||
ctx->last_pgscan = old_ctx->pgscan;
|
||||
ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
|
||||
ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
|
||||
ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
|
||||
ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
|
||||
|
||||
if (oomd_pgscan_rate(ctx) > 0)
|
||||
ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -32,10 +32,10 @@ struct OomdCGroupContext {
|
||||
|
||||
ManagedOOMPreference preference;
|
||||
|
||||
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
|
||||
/* These are only used for acting on high memory pressure. */
|
||||
loadavg_t mem_pressure_limit;
|
||||
usec_t mem_pressure_duration_usec;
|
||||
usec_t last_hit_mem_pressure_limit;
|
||||
usec_t mem_pressure_limit_hit_start;
|
||||
usec_t last_had_mem_reclaim;
|
||||
};
|
||||
|
||||
struct OomdSystemContext {
|
||||
@ -51,23 +51,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
|
||||
|
||||
/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
|
||||
* if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
|
||||
* `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
|
||||
* `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
|
||||
* below the limit.
|
||||
* Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
|
||||
* Returns -ENOMEM for allocation errors. */
|
||||
int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
|
||||
|
||||
/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
|
||||
* current sum is higher than the last interval's sum (there was some reclaim activity). */
|
||||
bool oomd_memory_reclaim(Hashmap *h);
|
||||
|
||||
/* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
|
||||
bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
|
||||
|
||||
/* Returns pgscan - last_pgscan, accounting for corner cases. */
|
||||
uint64_t oomd_pgscan_rate(const OomdCGroupContext *c);
|
||||
|
||||
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
|
||||
* (after the smallest values). */
|
||||
static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
|
||||
uint64_t last1, last2;
|
||||
uint64_t diff1, diff2;
|
||||
int r;
|
||||
|
||||
assert(c1);
|
||||
@ -77,22 +76,9 @@ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const
|
||||
if (r != 0)
|
||||
return r;
|
||||
|
||||
/* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */
|
||||
last2 = (*c2)->last_pgscan;
|
||||
if ((*c2)->last_pgscan > (*c2)->pgscan) {
|
||||
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
|
||||
(*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path);
|
||||
last2 = 0;
|
||||
}
|
||||
|
||||
last1 = (*c1)->last_pgscan;
|
||||
if ((*c1)->last_pgscan > (*c1)->pgscan) {
|
||||
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
|
||||
(*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path);
|
||||
last1 = 0;
|
||||
}
|
||||
|
||||
r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1);
|
||||
diff1 = oomd_pgscan_rate(*c1);
|
||||
diff2 = oomd_pgscan_rate(*c2);
|
||||
r = CMP(diff2, diff1);
|
||||
if (r != 0)
|
||||
return r;
|
||||
|
||||
@ -125,7 +111,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
|
||||
* everything in `h` is a candidate.
|
||||
* Returns the killed cgroup in ret_selected. */
|
||||
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
|
||||
int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected);
|
||||
int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
|
||||
|
||||
int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
|
||||
int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);
|
||||
|
@ -155,6 +155,9 @@ static int run(int argc, char *argv[]) {
|
||||
|
||||
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
|
||||
|
||||
if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC)
|
||||
log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s");
|
||||
|
||||
r = manager_new(&m);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create manager: %m");
|
||||
|
@ -160,9 +160,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
|
||||
assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
|
||||
|
||||
/* make sure certain values from h1 get updated in h2 */
|
||||
c1->pgscan = 5555;
|
||||
c1->pgscan = UINT64_MAX;
|
||||
c1->mem_pressure_limit = 6789;
|
||||
c1->last_hit_mem_pressure_limit = 42;
|
||||
c1->mem_pressure_limit_hit_start = 42;
|
||||
c1->last_had_mem_reclaim = 888;
|
||||
assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
|
||||
assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
|
||||
c1 = hashmap_get(h1, cgroup);
|
||||
@ -170,9 +171,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
|
||||
assert_se(c1);
|
||||
assert_se(c2);
|
||||
assert_se(c1 != c2);
|
||||
assert_se(c2->last_pgscan == 5555);
|
||||
assert_se(c2->last_pgscan == UINT64_MAX);
|
||||
assert_se(c2->mem_pressure_limit == 6789);
|
||||
assert_se(c2->last_hit_mem_pressure_limit == 42);
|
||||
assert_se(c2->mem_pressure_limit_hit_start == 42);
|
||||
assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
|
||||
|
||||
/* Assert that avoid/omit are not set if the cgroup is not owned by root */
|
||||
if (test_xattrs) {
|
||||
@ -189,20 +191,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
|
||||
char **paths = STRV_MAKE("/0.slice",
|
||||
"/1.slice");
|
||||
|
||||
OomdCGroupContext ctx_old[3] = {
|
||||
OomdCGroupContext ctx_old[2] = {
|
||||
{ .path = paths[0],
|
||||
.mem_pressure_limit = 5,
|
||||
.last_hit_mem_pressure_limit = 777,
|
||||
.mem_pressure_limit_hit_start = 777,
|
||||
.last_had_mem_reclaim = 888,
|
||||
.pgscan = 57 },
|
||||
{ .path = paths[1],
|
||||
.mem_pressure_limit = 6,
|
||||
.last_hit_mem_pressure_limit = 888,
|
||||
.mem_pressure_limit_hit_start = 888,
|
||||
.last_had_mem_reclaim = 888,
|
||||
.pgscan = 42 },
|
||||
};
|
||||
|
||||
OomdCGroupContext ctx_new[3] = {
|
||||
OomdCGroupContext ctx_new[2] = {
|
||||
{ .path = paths[0],
|
||||
.pgscan = 100 },
|
||||
.pgscan = 57 },
|
||||
{ .path = paths[1],
|
||||
.pgscan = 101 },
|
||||
};
|
||||
@ -221,13 +225,15 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
|
||||
assert_se(c_new = hashmap_get(h_new, "/0.slice"));
|
||||
assert_se(c_old->pgscan == c_new->last_pgscan);
|
||||
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
|
||||
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
|
||||
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
|
||||
assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
|
||||
|
||||
assert_se(c_old = hashmap_get(h_old, "/1.slice"));
|
||||
assert_se(c_new = hashmap_get(h_new, "/1.slice"));
|
||||
assert_se(c_old->pgscan == c_new->last_pgscan);
|
||||
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
|
||||
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
|
||||
assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
|
||||
assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
|
||||
}
|
||||
|
||||
static void test_oomd_system_context_acquire(void) {
|
||||
@ -290,7 +296,7 @@ static void test_oomd_pressure_above(void) {
|
||||
assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
|
||||
assert_se(set_contains(t1, &ctx[0]) == true);
|
||||
assert_se(c = hashmap_get(h1, "/herp.slice"));
|
||||
assert_se(c->last_hit_mem_pressure_limit > 0);
|
||||
assert_se(c->mem_pressure_limit_hit_start > 0);
|
||||
|
||||
/* Low memory pressure */
|
||||
assert_se(h2 = hashmap_new(&string_hash_ops));
|
||||
@ -298,7 +304,7 @@ static void test_oomd_pressure_above(void) {
|
||||
assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
|
||||
assert_se(t2 == NULL);
|
||||
assert_se(c = hashmap_get(h2, "/derp.slice"));
|
||||
assert_se(c->last_hit_mem_pressure_limit == 0);
|
||||
assert_se(c->mem_pressure_limit_hit_start == 0);
|
||||
|
||||
/* High memory pressure w/ multiple cgroups */
|
||||
assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
|
||||
@ -306,50 +312,9 @@ static void test_oomd_pressure_above(void) {
|
||||
assert_se(set_contains(t3, &ctx[0]) == true);
|
||||
assert_se(set_size(t3) == 1);
|
||||
assert_se(c = hashmap_get(h1, "/herp.slice"));
|
||||
assert_se(c->last_hit_mem_pressure_limit > 0);
|
||||
assert_se(c->mem_pressure_limit_hit_start > 0);
|
||||
assert_se(c = hashmap_get(h1, "/derp.slice"));
|
||||
assert_se(c->last_hit_mem_pressure_limit == 0);
|
||||
}
|
||||
|
||||
static void test_oomd_memory_reclaim(void) {
|
||||
_cleanup_hashmap_free_ Hashmap *h1 = NULL;
|
||||
char **paths = STRV_MAKE("/0.slice",
|
||||
"/1.slice",
|
||||
"/2.slice",
|
||||
"/3.slice",
|
||||
"/4.slice");
|
||||
|
||||
OomdCGroupContext ctx[5] = {
|
||||
{ .path = paths[0],
|
||||
.last_pgscan = 100,
|
||||
.pgscan = 100 },
|
||||
{ .path = paths[1],
|
||||
.last_pgscan = 100,
|
||||
.pgscan = 100 },
|
||||
{ .path = paths[2],
|
||||
.last_pgscan = 77,
|
||||
.pgscan = 33 },
|
||||
{ .path = paths[3],
|
||||
.last_pgscan = UINT64_MAX,
|
||||
.pgscan = 100 },
|
||||
{ .path = paths[4],
|
||||
.last_pgscan = 100,
|
||||
.pgscan = UINT64_MAX },
|
||||
};
|
||||
|
||||
assert_se(h1 = hashmap_new(&string_hash_ops));
|
||||
assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
|
||||
assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
|
||||
assert_se(oomd_memory_reclaim(h1) == false);
|
||||
|
||||
assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
|
||||
assert_se(oomd_memory_reclaim(h1) == false);
|
||||
|
||||
assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
|
||||
assert_se(oomd_memory_reclaim(h1) == true);
|
||||
|
||||
assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
|
||||
assert_se(oomd_memory_reclaim(h1) == false);
|
||||
assert_se(c->mem_pressure_limit_hit_start == 0);
|
||||
}
|
||||
|
||||
static void test_oomd_swap_free_below(void) {
|
||||
@ -468,7 +433,6 @@ int main(void) {
|
||||
test_oomd_update_cgroup_contexts_between_hashmaps();
|
||||
test_oomd_system_context_acquire();
|
||||
test_oomd_pressure_above();
|
||||
test_oomd_memory_reclaim();
|
||||
test_oomd_swap_free_below();
|
||||
test_oomd_sort_cgroups();
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user