Mirror of https://github.com/systemd/systemd.git

Merge pull request #19126 from anitazha/oomdimprovements

systemd-oomd post-test week improvements
Merged by Zbigniew Jędrzejewski-Szmek on 2021-04-06 07:59:59 +02:00, committed by GitHub
commit 9d5ae3a121
7 changed files with 281 additions and 226 deletions

man/oomd.conf.xml

@@ -52,9 +52,9 @@
         <listitem><para>Sets the limit for swap usage on the system before <command>systemd-oomd</command>
         will take action. If the fraction of swap used on the system is more than what is defined here,
-        <command>systemd-oomd</command> will act on eligible descendant control groups, starting from the
-        ones with the highest swap usage to the lowest swap usage. Which control groups are monitored and
-        what action gets taken depends on what the unit has configured for
+        <command>systemd-oomd</command> will act on eligible descendant control groups with swap usage greater
+        than 5% of total swap, starting from the ones with the highest swap usage. Which
+        control groups are monitored and what action gets taken depends on what the unit has configured for
         <varname>ManagedOOMSwap=</varname>. Takes a value specified in percent (when suffixed with "%"),
         permille ("‰") or permyriad ("‱"), between 0% and 100%, inclusive. Defaults to 90%.</para></listitem>
       </varlistentry>
@@ -81,7 +81,7 @@
         <listitem><para>Sets the amount of time a unit's control group needs to have exceeded memory pressure
         limits before <command>systemd-oomd</command> will take action. Memory pressure limits are defined by
         <varname>DefaultMemoryPressureLimit=</varname> and <varname>ManagedOOMMemoryPressureLimit=</varname>.
-        Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
+        Must be set to 0, or at least 1 second. Defaults to 30 seconds when unset or 0.</para></listitem>
       </varlistentry>
     </variablelist>
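
For readers without the manual page in front of them, a minimal /etc/systemd/oomd.conf spelling out the settings this hunk documents might look like the following (the values shown are simply the documented defaults):

    [OOM]
    SwapUsedLimit=90%
    DefaultMemoryPressureLimit=60%
    DefaultMemoryPressureDurationSec=30s

With these values systemd-oomd starts acting once more than 90% of swap is in use (and, per the new wording, only considers cgroups using more than 5% of total swap), while a pressure-based kill requires a monitored cgroup to stay above 60% PSI "full" pressure for 30 seconds.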

src/oom/oomd-manager.c

@@ -299,8 +299,7 @@ static int acquire_managed_oom_connect(Manager *m) {
         return 0;
 }
 
-static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
-        _cleanup_set_free_ Set *targets = NULL;
+static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
         Manager *m = userdata;
         usec_t usec_now;
         int r;
@@ -313,7 +312,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
         if (r < 0)
                 return log_error_errno(r, "Failed to reset event timer: %m");
 
-        r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
+        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
         if (r < 0)
                 return log_error_errno(r, "Failed to set relative time for timer: %m");
@@ -324,13 +323,98 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
                         return log_error_errno(r, "Failed to acquire varlink connection: %m");
         }
 
-        /* Update the cgroups used for detection/action */
-        r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
-        if (r == -ENOMEM)
-                return log_oom();
-        if (r < 0)
-                log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
+        /* We still try to acquire swap information for oomctl even if no units want swap monitoring */
+        r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
+        /* If there are no units depending on swap actions, the only error we exit on is ENOMEM.
+         * Allow ENOENT in the event that swap is disabled on the system. */
+        if (r == -ENOENT) {
+                zero(m->system_context);
+                return 0;
+        } else if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
+                return log_error_errno(r, "Failed to acquire system context: %m");
+
+        /* Return early if nothing is requesting swap monitoring */
+        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
+                return 0;
+
+        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
+         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
+         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
+         * nodes are the ones that matter). */
+
+        if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
+                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
+                _cleanup_free_ char *selected = NULL;
+                uint64_t threshold;
+
+                log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
+                          m->system_context.swap_used, m->system_context.swap_total,
+                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+
+                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
+
+                threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
+                r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
+                if (r == -ENOMEM)
+                        return log_oom();
+                if (r < 0)
+                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
+                else {
+                        if (selected)
+                                log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
+                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
+                                           selected, m->system_context.swap_used, m->system_context.swap_total,
+                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
+                        return 0;
+                }
+        }
+
+        return 0;
+}
+
+static void clear_candidate_hashmapp(Manager **m) {
+        if (*m)
+                hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
+}
+
+static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
+        /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
+         * update the candidate data (in which case clear_candidates will be NULL). */
+        _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
+        _cleanup_set_free_ Set *targets = NULL;
+        bool in_post_action_delay = false;
+        Manager *m = userdata;
+        usec_t usec_now;
+        int r;
+
+        assert(s);
+        assert(userdata);
+
+        /* Reset timer */
+        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
+        if (r < 0)
+                return log_error_errno(r, "Failed to reset event timer: %m");
+
+        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
+        if (r < 0)
+                return log_error_errno(r, "Failed to set relative time for timer: %m");
+
+        /* Reconnect if our connection dropped */
+        if (!m->varlink) {
+                r = acquire_managed_oom_connect(m);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
+        }
+
+        /* Return early if nothing is requesting memory pressure monitoring */
+        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
+                return 0;
 
+        /* Update the cgroups used for detection/action */
         r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
         if (r == -ENOMEM)
                 return log_oom();
@@ -344,23 +428,13 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
         if (r < 0)
                 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
 
-        r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
-        /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
-         * Allow ENOENT in the event that swap is disabled on the system. */
-        if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
-                return log_error_errno(r, "Failed to acquire system context: %m");
-        else if (r == -ENOENT)
-                zero(m->system_context);
-
-        if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
-                m->last_reclaim_at = usec_now;
-
-        /* If we're still recovering from a kill, don't try to kill again yet */
-        if (m->post_action_delay_start > 0) {
-                if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
-                        return 0;
+        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
+         * values and go on a kill storm. */
+        if (m->mem_pressure_post_action_delay_start > 0) {
+                if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
+                        in_post_action_delay = true;
                 else
-                        m->post_action_delay_start = 0;
+                        m->mem_pressure_post_action_delay_start = 0;
         }
 
         r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
@@ -368,91 +442,92 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
                 return log_oom();
         if (r < 0)
                 log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
-        else if (r == 1) {
-                /* Check if there was reclaim activity in the given interval. The concern is the following case:
-                 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
-                 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
-                 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
-                 * to kill something (it won't help anyways). */
-                if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
-                        OomdCGroupContext *t;
-
-                        SET_FOREACH(t, targets) {
-                                _cleanup_free_ char *selected = NULL;
-                                char ts[FORMAT_TIMESPAN_MAX];
-
-                                log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
-                                          t->path,
-                                          LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                          LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                          format_timespan(ts, sizeof ts,
-                                                          m->default_mem_pressure_duration_usec,
-                                                          USEC_PER_SEC));
-
-                                r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
-                                if (r == -ENOMEM)
-                                        return log_oom();
-                                if (r < 0)
-                                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
-                                else {
-                                        /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
-                                        m->post_action_delay_start = usec_now;
-                                        if (selected)
-                                                log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
-                                                           " for > %s with reclaim activity",
-                                                           selected, t->path,
-                                                           LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                                           LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                                           format_timespan(ts, sizeof ts,
-                                                                           m->default_mem_pressure_duration_usec,
-                                                                           USEC_PER_SEC));
-                                        return 0;
-                                }
-                        }
-                }
-        }
-
-        if (oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
-                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
-                _cleanup_free_ char *selected = NULL;
-
-                log_debug("Swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
-                          m->system_context.swap_used, m->system_context.swap_total,
-                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
-
-                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
-                if (r == -ENOMEM)
-                        return log_oom();
-                if (r < 0)
-                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
-
-                r = oomd_kill_by_swap_usage(candidates, m->dry_run, &selected);
-                if (r == -ENOMEM)
-                        return log_oom();
-                if (r < 0)
-                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
-                else {
-                        m->post_action_delay_start = usec_now;
-                        if (selected)
-                                log_notice("Killed %s due to swap used (%"PRIu64") / total (%"PRIu64") being more than "
-                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
-                                           selected, m->system_context.swap_used, m->system_context.swap_total,
-                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
-                        return 0;
-                }
-        }
+        else if (r == 1 && !in_post_action_delay) {
+                OomdCGroupContext *t;
+
+                SET_FOREACH(t, targets) {
+                        _cleanup_free_ char *selected = NULL;
+                        char ts[FORMAT_TIMESPAN_MAX];
+
+                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
+                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+                         * to kill something (it won't help anyways). */
+                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
+                                continue;
+
+                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
+                                  t->path,
+                                  LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                  LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                  format_timespan(ts, sizeof ts,
+                                                  m->default_mem_pressure_duration_usec,
+                                                  USEC_PER_SEC));
+
+                        r = update_monitored_cgroup_contexts_candidates(
+                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+                        else
+                                clear_candidates = NULL;
+
+                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
+                        else {
+                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
+                                m->mem_pressure_post_action_delay_start = usec_now;
+                                if (selected)
+                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+                                                   " for > %s with reclaim activity",
+                                                   selected, t->path,
+                                                   LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                                   LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                                   format_timespan(ts, sizeof ts,
+                                                                   m->default_mem_pressure_duration_usec,
+                                                                   USEC_PER_SEC));
+                                return 0;
+                        }
+                }
+        } else {
+                /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
+                 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
+                 * might happen.
+                 * Candidate cgroup data will continue to get updated during the post-action delay period in case
+                 * pressure continues to be high after a kill. */
+                OomdCGroupContext *c;
+                HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
+                        if (c->mem_pressure_limit_hit_start == 0)
+                                continue;
+
+                        r = update_monitored_cgroup_contexts_candidates(
+                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
+                        else {
+                                clear_candidates = NULL;
+                                break;
+                        }
+                }
+        }
 
         return 0;
 }
 
-static int monitor_cgroup_contexts(Manager *m) {
+static int monitor_swap_contexts(Manager *m) {
         _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
         int r;
 
         assert(m);
         assert(m->event);
 
-        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_cgroup_contexts_handler, m);
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
         if (r < 0)
                 return r;
@@ -464,9 +539,34 @@ static int monitor_cgroup_contexts(Manager *m) {
         if (r < 0)
                 return r;
 
-        (void) sd_event_source_set_description(s, "oomd-timer");
+        (void) sd_event_source_set_description(s, "oomd-swap-timer");
 
-        m->cgroup_context_event_source = TAKE_PTR(s);
+        m->swap_context_event_source = TAKE_PTR(s);
+        return 0;
+}
+
+static int monitor_memory_pressure_contexts(Manager *m) {
+        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
+        int r;
+
+        assert(m);
+        assert(m->event);
+
+        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_exit_on_failure(s, true);
+        if (r < 0)
+                return r;
+
+        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
+        if (r < 0)
+                return r;
+
+        (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");
+
+        m->mem_pressure_context_event_source = TAKE_PTR(s);
         return 0;
 }
@@ -474,7 +574,8 @@ Manager* manager_free(Manager *m) {
         assert(m);
 
         varlink_close_unref(m->varlink);
-        sd_event_source_unref(m->cgroup_context_event_source);
+        sd_event_source_unref(m->swap_context_event_source);
+        sd_event_source_unref(m->mem_pressure_context_event_source);
         sd_event_unref(m->event);
 
         bus_verify_polkit_async_registry_free(m->polkit_registry);
@@ -596,7 +697,11 @@ int manager_start(
         if (r < 0)
                 return r;
 
-        r = monitor_cgroup_contexts(m);
+        r = monitor_memory_pressure_contexts(m);
+        if (r < 0)
+                return r;
+
+        r = monitor_swap_contexts(m);
         if (r < 0)
                 return r;
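
Putting numbers to the swap path above may help when reviewing it. The standalone sketch below (simplified stand-in types and a local helper, not the systemd APIs themselves) shows the two cut-offs the new swap handler applies: the system-wide SwapUsedLimit= gate expressed in permyriad, and the per-candidate threshold of THRESHOLD_SWAP_USED_PERCENT of total swap:

    #include <inttypes.h>
    #include <stdbool.h>
    #include <stdio.h>
    
    /* Same constant the patch adds in oomd-manager.h. */
    #define THRESHOLD_SWAP_USED_PERCENT 5
    
    /* Mirrors the idea of oomd_swap_free_below(): free swap, measured as a
     * permyriad of total swap, has dropped under the given threshold. */
    static bool swap_free_below(uint64_t swap_used, uint64_t swap_total, int threshold_permyriad) {
            uint64_t swap_free = swap_total - swap_used;
            return swap_free * 10000 < (uint64_t) threshold_permyriad * swap_total;
    }
    
    int main(void) {
            uint64_t swap_total = 8ULL * 1024 * 1024 * 1024;                         /* 8 GiB of swap */
            uint64_t swap_used  = 7ULL * 1024 * 1024 * 1024 + 512ULL * 1024 * 1024;  /* 7.5 GiB used */
            int swap_used_limit_permyriad = 9000;                                    /* SwapUsedLimit=90% */
    
            /* Gate 1: act only once used swap exceeds the configured limit,
             * i.e. free swap falls below (10000 - limit) permyriad of total. */
            bool act = swap_free_below(swap_used, swap_total, 10000 - swap_used_limit_permyriad);
    
            /* Gate 2: per-candidate threshold, 5% of total swap; cgroups using
             * less than this are skipped by the kill loop. */
            uint64_t threshold = swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
    
            printf("act=%s, per-cgroup threshold=%" PRIu64 " bytes (~%.0f MiB)\n",
                   act ? "yes" : "no", threshold, threshold / (1024.0 * 1024));
            return 0;
    }

With 8 GiB of swap, 7.5 GiB used and the default 90% limit, the gate is hit (only 6.25% of swap is free) and only cgroups using more than roughly 410 MiB of swap are considered for a kill.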

src/oom/oomd-manager.h

@@ -7,10 +7,9 @@
 #include "varlink.h"
 
 /* Polling interval for monitoring stats */
-#define INTERVAL_USEC (1 * USEC_PER_SEC)
-
-/* Used to weight the averages */
-#define AVERAGE_SIZE_DECAY 4
+#define SWAP_INTERVAL_USEC 150000 /* 0.15 seconds */
+/* Pressure counters are lagging (~2 seconds) compared to swap so polling too frequently just wastes CPU */
+#define MEM_PRESSURE_INTERVAL_USEC (1 * USEC_PER_SEC)
 
 /* Take action if 10s of memory pressure > 60 for more than 30s. We use the "full" value from PSI so this is the
  * percentage of time all tasks were delayed (i.e. unproductive).
@@ -20,6 +19,9 @@
 #define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60
 #define DEFAULT_SWAP_USED_LIMIT_PERCENT 90
 
+/* Only tackle candidates with large swap usage. */
+#define THRESHOLD_SWAP_USED_PERCENT 5
+
 #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
 #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
@@ -44,10 +46,10 @@ struct Manager {
         OomdSystemContext system_context;
 
-        usec_t last_reclaim_at;
-        usec_t post_action_delay_start;
+        usec_t mem_pressure_post_action_delay_start;
 
-        sd_event_source *cgroup_context_event_source;
+        sd_event_source *swap_context_event_source;
+        sd_event_source *mem_pressure_context_event_source;
 
         Varlink *varlink;
 };

src/oom/oomd-util.c

@@ -82,17 +82,17 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
                 if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
                         usec_t diff;
 
-                        if (ctx->last_hit_mem_pressure_limit == 0)
-                                ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
+                        if (ctx->mem_pressure_limit_hit_start == 0)
+                                ctx->mem_pressure_limit_hit_start = now(CLOCK_MONOTONIC);
 
-                        diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
+                        diff = now(CLOCK_MONOTONIC) - ctx->mem_pressure_limit_hit_start;
                         if (diff >= duration) {
                                 r = set_put(targets, ctx);
                                 if (r < 0)
                                         return -ENOMEM;
                         }
                 } else
-                        ctx->last_hit_mem_pressure_limit = 0;
+                        ctx->mem_pressure_limit_hit_start = 0;
         }
 
         if (!set_isempty(targets)) {
@@ -104,34 +104,21 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
         return 0;
 }
 
-bool oomd_memory_reclaim(Hashmap *h) {
-        uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
-        OomdCGroupContext *ctx;
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
+        uint64_t last_pgscan;
 
-        assert(h);
+        assert(c);
 
-        /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
-         * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
-
-        HASHMAP_FOREACH(ctx, h) {
-                uint64_t sum;
-
-                sum = pgscan + ctx->pgscan;
-                if (sum < pgscan || sum < ctx->pgscan)
-                        pgscan_of++; /* count overflows */
-                pgscan = sum;
-
-                sum = last_pgscan + ctx->last_pgscan;
-                if (sum < last_pgscan || sum < ctx->last_pgscan)
-                        last_pgscan_of++; /* count overflows */
-                last_pgscan = sum;
+        /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero.
+         * pgscan is monotonic and in practice should not decrease (except in the recreation case). */
+        last_pgscan = c->last_pgscan;
+        if (c->last_pgscan > c->pgscan) {
+                log_debug("Last pgscan %"PRIu64" greater than current pgscan %"PRIu64" for %s. Using last pgscan of zero.",
+                          c->last_pgscan, c->pgscan, c->path);
+                last_pgscan = 0;
         }
 
-        /* overflow counts are the same, return sums comparison */
-        if (last_pgscan_of == pgscan_of)
-                return pgscan > last_pgscan;
-
-        return pgscan_of > last_pgscan_of;
+        return c->pgscan - last_pgscan;
 }
 
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad) {
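
The corner case handled by the new helper is easiest to see with concrete numbers. A minimal standalone sketch of the same logic (simplified types and a local function, not the systemd code itself):

    #include <inttypes.h>
    #include <stdio.h>
    
    /* pgscan is a monotonic counter, so a current value smaller than the
     * previous one is treated as a cgroup that was torn down and recreated,
     * and the previous value is ignored. */
    static uint64_t pgscan_rate(uint64_t last_pgscan, uint64_t pgscan) {
            if (last_pgscan > pgscan)
                    last_pgscan = 0;
            return pgscan - last_pgscan;
    }
    
    int main(void) {
            printf("%" PRIu64 "\n", pgscan_rate(100, 150));  /* normal case: 50 pages scanned since last interval */
            printf("%" PRIu64 "\n", pgscan_rate(100, 100));  /* no reclaim activity: 0 */
            printf("%" PRIu64 "\n", pgscan_rate(500, 30));   /* cgroup recreated: counter restarted, rate is 30 */
            return 0;
    }

A non-zero rate is also what the insert/update paths further down use to stamp last_had_mem_reclaim, which the memory pressure handler compares against RECLAIM_DURATION_USEC.
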
@@ -246,7 +233,7 @@ int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected) {
         return ret;
 }
 
-int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected) {
         _cleanup_free_ OomdCGroupContext **sorted = NULL;
         int n, r, ret = 0;
@@ -257,12 +244,12 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected) {
         if (n < 0)
                 return n;
 
-        /* Try to kill cgroups with non-zero swap usage until we either succeed in
-         * killing or we get to a cgroup with no swap usage. */
+        /* Try to kill cgroups with non-zero swap usage until we either succeed in killing or we get to a cgroup with
+         * no swap usage. Threshold killing only cgroups with more than threshold swap usage. */
         for (int i = 0; i < n; i++) {
-                /* Skip over cgroups with no resource usage.
-                 * Continue break since there might be "avoid" cgroups at the end. */
-                if (sorted[i]->swap_usage == 0)
+                /* Skip over cgroups with not enough swap usage. Don't break since there might be "avoid"
+                 * cgroups at the end. */
+                if (sorted[i]->swap_usage <= threshold_usage)
                         continue;
 
                 r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
@@ -430,9 +417,13 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) {
         if (old_ctx) {
                 curr_ctx->last_pgscan = old_ctx->pgscan;
                 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
-                curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                curr_ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
+                curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
         }
 
+        if (oomd_pgscan_rate(curr_ctx) > 0)
+                curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+
         r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
         if (r < 0)
                 return r;
@@ -456,7 +447,11 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) {
                 ctx->last_pgscan = old_ctx->pgscan;
                 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
-                ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                ctx->mem_pressure_limit_hit_start = old_ctx->mem_pressure_limit_hit_start;
+                ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+
+                if (oomd_pgscan_rate(ctx) > 0)
+                        ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
         }
 }

src/oom/oomd-util.h

@@ -32,10 +32,10 @@ struct OomdCGroupContext {
         ManagedOOMPreference preference;
 
-        /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+        /* These are only used for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
-        usec_t mem_pressure_duration_usec;
-        usec_t last_hit_mem_pressure_limit;
+        usec_t mem_pressure_limit_hit_start;
+        usec_t last_had_mem_reclaim;
 };
 
 struct OomdSystemContext {
@@ -51,23 +51,22 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
 /* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
  * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
- * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
+ * `mem_pressure_limit_hit_start` is updated accordingly for the first time the limit is exceeded, and when it returns
  * below the limit.
  * Returns 0 and sets `ret` to an empty set if no entries exceeded limits for `duration`.
  * Returns -ENOMEM for allocation errors. */
 int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
 
-/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
- * current sum is higher than the last interval's sum (there was some reclaim activity). */
-bool oomd_memory_reclaim(Hashmap *h);
-
 /* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);
 
+/* Returns pgscan - last_pgscan, accounting for corner cases. */
+uint64_t oomd_pgscan_rate(const OomdCGroupContext *c);
+
 /* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
  * (after the smallest values). */
 static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
-        uint64_t last1, last2;
+        uint64_t diff1, diff2;
         int r;
 
         assert(c1);
@@ -77,22 +76,9 @@ static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
         if (r != 0)
                 return r;
 
-        /* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */
-        last2 = (*c2)->last_pgscan;
-        if ((*c2)->last_pgscan > (*c2)->pgscan) {
-                log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
-                         (*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path);
-                last2 = 0;
-        }
-
-        last1 = (*c1)->last_pgscan;
-        if ((*c1)->last_pgscan > (*c1)->pgscan) {
-                log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
-                         (*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path);
-                last1 = 0;
-        }
-
-        r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1);
+        diff1 = oomd_pgscan_rate(*c1);
+        diff2 = oomd_pgscan_rate(*c2);
+        r = CMP(diff2, diff1);
         if (r != 0)
                 return r;
@@ -125,7 +111,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
  * everything in `h` is a candidate.
  * Returns the killed cgroup in ret_selected. */
 int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run, char **ret_selected);
-int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run, char **ret_selected);
+int oomd_kill_by_swap_usage(Hashmap *h, uint64_t threshold_usage, bool dry_run, char **ret_selected);
 
 int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
 int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret);
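
For intuition about how the reworked comparator orders kill candidates, here is a self-contained toy version using plain qsort() and a stripped-down struct (the real compare_pgscan_rate_and_memory_usage() additionally sorts ManagedOOMPreference "avoid" entries to the end and uses systemd's CMP() helper; the slice names and numbers below are made up for illustration):

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    typedef struct Candidate {
            const char *path;
            uint64_t last_pgscan, pgscan, memory_usage;
    } Candidate;
    
    /* Same rate rule as oomd_pgscan_rate(): a shrinking counter means the
     * cgroup was recreated, so the previous value is treated as zero. */
    static uint64_t rate(const Candidate *c) {
            return c->last_pgscan > c->pgscan ? c->pgscan : c->pgscan - c->last_pgscan;
    }
    
    /* Largest pgscan rate first; ties broken by larger current memory usage. */
    static int cmp(const void *a, const void *b) {
            const Candidate *c1 = a, *c2 = b;
            uint64_t r1 = rate(c1), r2 = rate(c2);
            if (r1 != r2)
                    return r2 > r1 ? 1 : -1;
            if (c1->memory_usage != c2->memory_usage)
                    return c2->memory_usage > c1->memory_usage ? 1 : -1;
            return 0;
    }
    
    int main(void) {
            Candidate c[] = {
                    { "/workload.slice", 100, 400, 2048 },  /* rate 300, larger memory usage */
                    { "/idle.slice",     100, 100, 4096 },  /* rate 0 */
                    { "/batch.slice",    200, 500, 1024 },  /* rate 300, smaller memory usage */
            };
    
            qsort(c, 3, sizeof(Candidate), cmp);
            for (size_t i = 0; i < 3; i++)
                    printf("%zu. %s\n", i + 1, c[i].path);
            /* Expected order: /workload.slice, /batch.slice, /idle.slice */
            return 0;
    }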

src/oom/oomd.c

@@ -155,6 +155,9 @@ static int run(int argc, char *argv[]) {
         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
 
+        if (arg_mem_pressure_usec > 0 && arg_mem_pressure_usec < 1 * USEC_PER_SEC)
+                return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "DefaultMemoryPressureDurationSec= must be 0 or at least 1s");
+
         r = manager_new(&m);
         if (r < 0)
                 return log_error_errno(r, "Failed to create manager: %m");

src/oom/test-oomd-util.c

@@ -160,9 +160,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
 
         /* make sure certain values from h1 get updated in h2 */
-        c1->pgscan = 5555;
+        c1->pgscan = UINT64_MAX;
         c1->mem_pressure_limit = 6789;
-        c1->last_hit_mem_pressure_limit = 42;
+        c1->mem_pressure_limit_hit_start = 42;
+        c1->last_had_mem_reclaim = 888;
         assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
         assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
         c1 = hashmap_get(h1, cgroup);
@@ -170,9 +171,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(c1);
         assert_se(c2);
         assert_se(c1 != c2);
-        assert_se(c2->last_pgscan == 5555);
+        assert_se(c2->last_pgscan == UINT64_MAX);
         assert_se(c2->mem_pressure_limit == 6789);
-        assert_se(c2->last_hit_mem_pressure_limit == 42);
+        assert_se(c2->mem_pressure_limit_hit_start == 42);
+        assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
 
         /* Assert that avoid/omit are not set if the cgroup is not owned by root */
         if (test_xattrs) {
@@ -189,20 +191,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         char **paths = STRV_MAKE("/0.slice",
                                  "/1.slice");
 
-        OomdCGroupContext ctx_old[3] = {
+        OomdCGroupContext ctx_old[2] = {
                 { .path = paths[0],
                   .mem_pressure_limit = 5,
-                  .last_hit_mem_pressure_limit = 777,
+                  .mem_pressure_limit_hit_start = 777,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 57 },
                 { .path = paths[1],
                   .mem_pressure_limit = 6,
-                  .last_hit_mem_pressure_limit = 888,
+                  .mem_pressure_limit_hit_start = 888,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 42 },
         };
 
-        OomdCGroupContext ctx_new[3] = {
+        OomdCGroupContext ctx_new[2] = {
                 { .path = paths[0],
-                  .pgscan = 100 },
+                  .pgscan = 57 },
                 { .path = paths[1],
                   .pgscan = 101 },
         };
@@ -221,13 +225,15 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         assert_se(c_new = hashmap_get(h_new, "/0.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
-        assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
+        assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
 
         assert_se(c_old = hashmap_get(h_old, "/1.slice"));
         assert_se(c_new = hashmap_get(h_new, "/1.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
-        assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->mem_pressure_limit_hit_start == c_new->mem_pressure_limit_hit_start);
+        assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
 }
 
 static void test_oomd_system_context_acquire(void) {
@@ -290,7 +296,7 @@ static void test_oomd_pressure_above(void) {
         assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
         assert_se(set_contains(t1, &ctx[0]) == true);
         assert_se(c = hashmap_get(h1, "/herp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit > 0);
+        assert_se(c->mem_pressure_limit_hit_start > 0);
 
         /* Low memory pressure */
         assert_se(h2 = hashmap_new(&string_hash_ops));
@@ -298,7 +304,7 @@ static void test_oomd_pressure_above(void) {
         assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
         assert_se(t2 == NULL);
         assert_se(c = hashmap_get(h2, "/derp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit == 0);
+        assert_se(c->mem_pressure_limit_hit_start == 0);
 
         /* High memory pressure w/ multiple cgroups */
         assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
@@ -306,50 +312,9 @@ static void test_oomd_pressure_above(void) {
         assert_se(set_contains(t3, &ctx[0]) == true);
         assert_se(set_size(t3) == 1);
         assert_se(c = hashmap_get(h1, "/herp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit > 0);
+        assert_se(c->mem_pressure_limit_hit_start > 0);
         assert_se(c = hashmap_get(h1, "/derp.slice"));
-        assert_se(c->last_hit_mem_pressure_limit == 0);
+        assert_se(c->mem_pressure_limit_hit_start == 0);
-}
-
-static void test_oomd_memory_reclaim(void) {
-        _cleanup_hashmap_free_ Hashmap *h1 = NULL;
-        char **paths = STRV_MAKE("/0.slice",
-                                 "/1.slice",
-                                 "/2.slice",
-                                 "/3.slice",
-                                 "/4.slice");
-
-        OomdCGroupContext ctx[5] = {
-                { .path = paths[0],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[1],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[2],
-                  .last_pgscan = 77,
-                  .pgscan = 33 },
-                { .path = paths[3],
-                  .last_pgscan = UINT64_MAX,
-                  .pgscan = 100 },
-                { .path = paths[4],
-                  .last_pgscan = 100,
-                  .pgscan = UINT64_MAX },
-        };
-
-        assert_se(h1 = hashmap_new(&string_hash_ops));
-        assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
-        assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == true);
-
-        assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
 }
 
 static void test_oomd_swap_free_below(void) {
@@ -468,7 +433,6 @@ int main(void) {
         test_oomd_update_cgroup_contexts_between_hashmaps();
         test_oomd_system_context_acquire();
         test_oomd_pressure_above();
-        test_oomd_memory_reclaim();
         test_oomd_swap_free_below();
         test_oomd_sort_cgroups();