
oomd: rework memory reclaim detection logic

systemd-oomd only monitors and kills within a selected cgroup subtree.
For memory pressure kills, this means it's unnecessary to get the
pgscan rate across all the monitored memory pressure cgroups.
The increase will show up whether we do a total sum or not, but since
we only care about the increase in the subtree we're about to target
for a kill, we can simplify the code a bit by not doing this total sum.
Author: Anita Zhang
Date:   2021-03-26 00:39:25 -07:00
Commit: df637ede7b
Parent: 37d8020ccc
5 changed files with 57 additions and 122 deletions
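
For orientation, the sketch below is a minimal, self-contained illustration of the per-cgroup reclaim tracking this commit switches to: remember, per cgroup, when its pgscan counter last increased, and only treat the cgroup as a kill candidate while that moment is recent. The field names mirror the patch (pgscan, last_pgscan, last_had_mem_reclaim), but the CGroupContext struct, the 30-second window, the clock helper and main() are illustrative stand-ins, not the daemon's actual types or constants.

/* Hypothetical standalone sketch of per-cgroup reclaim detection. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define USEC_PER_SEC 1000000ULL
#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)  /* assumed window; the daemon defines its own */

typedef struct CGroupContext {
        const char *path;
        uint64_t last_pgscan;           /* pgscan sampled in the previous interval */
        uint64_t pgscan;                /* pgscan sampled in the current interval */
        uint64_t last_had_mem_reclaim;  /* CLOCK_MONOTONIC timestamp in usec, 0 = never */
} CGroupContext;

static uint64_t now_usec(void) {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t) ts.tv_sec * USEC_PER_SEC + (uint64_t) ts.tv_nsec / 1000;
}

/* If this cgroup scanned more pages than in the last interval, it reclaimed; remember when. */
static void note_reclaim(CGroupContext *c) {
        if (c->pgscan > c->last_pgscan)
                c->last_had_mem_reclaim = now_usec();
}

/* A cgroup is only worth killing for pressure if it reclaimed recently;
 * lingering high pressure without reclaim means a kill will not help. */
static bool had_recent_reclaim(const CGroupContext *c) {
        return c->last_had_mem_reclaim > 0 &&
               now_usec() - c->last_had_mem_reclaim <= RECLAIM_DURATION_USEC;
}

int main(void) {
        CGroupContext c = { .path = "/workload.slice", .last_pgscan = 100, .pgscan = 160 };

        note_reclaim(&c);  /* pgscan grew by 60 pages, so the timestamp is refreshed */
        printf("%s eligible for a pressure kill: %s\n",
               c.path, had_recent_reclaim(&c) ? "yes" : "no");
        return 0;
}

Compared to the removed oomd_memory_reclaim(), which summed pgscan across every monitored cgroup, this keeps the reclaim signal local to the subtree that would actually be targeted for a kill.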

View File

@@ -419,9 +419,6 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         if (r < 0)
                 log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
 
-        if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
-                m->last_reclaim_at = usec_now;
-
         /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
          * values and go on a kill storm. */
         if (m->mem_pressure_post_action_delay_start > 0) {
@@ -437,45 +434,45 @@ static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t
         if (r < 0)
                 log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
         else if (r == 1) {
-                /* Check if there was reclaim activity in the given interval. The concern is the following case:
-                 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
-                 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
-                 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
-                 * to kill something (it won't help anyways). */
-                if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
-                        OomdCGroupContext *t;
-
-                        SET_FOREACH(t, targets) {
-                                _cleanup_free_ char *selected = NULL;
-                                char ts[FORMAT_TIMESPAN_MAX];
-
-                                log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
-                                          t->path,
-                                          LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                          LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                          format_timespan(ts, sizeof ts,
-                                                          m->default_mem_pressure_duration_usec,
-                                                          USEC_PER_SEC));
-
-                                r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
-                                if (r == -ENOMEM)
-                                        return log_oom();
-                                if (r < 0)
-                                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
-                                else {
-                                        /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
-                                        m->mem_pressure_post_action_delay_start = usec_now;
-                                        if (selected)
-                                                log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
-                                                           " for > %s with reclaim activity",
-                                                           selected, t->path,
-                                                           LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
-                                                           LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
-                                                           format_timespan(ts, sizeof ts,
-                                                                           m->default_mem_pressure_duration_usec,
-                                                                           USEC_PER_SEC));
-                                        return 0;
-                                }
-                        }
-                }
+                OomdCGroupContext *t;
+
+                SET_FOREACH(t, targets) {
+                        _cleanup_free_ char *selected = NULL;
+                        char ts[FORMAT_TIMESPAN_MAX];
+
+                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
+                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
+                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
+                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
+                         * to kill something (it won't help anyways). */
+                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
+                                continue;
+
+                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
+                                  t->path,
+                                  LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                  LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                  format_timespan(ts, sizeof ts,
+                                                  m->default_mem_pressure_duration_usec,
+                                                  USEC_PER_SEC));
+
+                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
+                        if (r == -ENOMEM)
+                                return log_oom();
+                        if (r < 0)
+                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
+                        else {
+                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
+                                m->mem_pressure_post_action_delay_start = usec_now;
+                                if (selected)
+                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
+                                                   " for > %s with reclaim activity",
+                                                   selected, t->path,
+                                                   LOAD_INT(t->memory_pressure.avg10), LOAD_FRAC(t->memory_pressure.avg10),
+                                                   LOAD_INT(t->mem_pressure_limit), LOAD_FRAC(t->mem_pressure_limit),
+                                                   format_timespan(ts, sizeof ts,
+                                                                   m->default_mem_pressure_duration_usec,
+                                                                   USEC_PER_SEC));
+                                return 0;
+                        }
+                }
         }

View File

@@ -46,7 +46,6 @@ struct Manager {
         OomdSystemContext system_context;
 
-        usec_t last_reclaim_at;
         usec_t mem_pressure_post_action_delay_start;
 
         sd_event_source *swap_context_event_source;

View File

@@ -104,36 +104,6 @@ int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) {
         return 0;
 }
 
-bool oomd_memory_reclaim(Hashmap *h) {
-        uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
-        OomdCGroupContext *ctx;
-
-        assert(h);
-
-        /* If sum of all the current pgscan values are greater than the sum of all the last_pgscan values,
-         * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
-        HASHMAP_FOREACH(ctx, h) {
-                uint64_t sum;
-
-                sum = pgscan + ctx->pgscan;
-                if (sum < pgscan || sum < ctx->pgscan)
-                        pgscan_of++; /* count overflows */
-                pgscan = sum;
-
-                sum = last_pgscan + ctx->last_pgscan;
-                if (sum < last_pgscan || sum < ctx->last_pgscan)
-                        last_pgscan_of++; /* count overflows */
-                last_pgscan = sum;
-        }
-
-        /* overflow counts are the same, return sums comparison */
-        if (last_pgscan_of == pgscan_of)
-                return pgscan > last_pgscan;
-
-        return pgscan_of > last_pgscan_of;
-}
-
 uint64_t oomd_pgscan_rate(const OomdCGroupContext *c) {
         uint64_t last_pgscan;
@@ -448,8 +418,12 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
                 curr_ctx->last_pgscan = old_ctx->pgscan;
                 curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
                 curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                curr_ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
         }
 
+        if (oomd_pgscan_rate(curr_ctx) > 0)
+                curr_ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
+
         r = hashmap_put(new_h, curr_ctx->path, curr_ctx);
         if (r < 0)
                 return r;
@@ -474,6 +448,10 @@ void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_
                 ctx->last_pgscan = old_ctx->pgscan;
                 ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
                 ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+                ctx->last_had_mem_reclaim = old_ctx->last_had_mem_reclaim;
+
+                if (oomd_pgscan_rate(ctx) > 0)
+                        ctx->last_had_mem_reclaim = now(CLOCK_MONOTONIC);
         }
 }

View File

@@ -32,10 +32,11 @@ struct OomdCGroupContext {
         ManagedOOMPreference preference;
 
-        /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+        /* These are only used for acting on high memory pressure. */
         loadavg_t mem_pressure_limit;
         usec_t mem_pressure_duration_usec;
         usec_t last_hit_mem_pressure_limit;
+        usec_t last_had_mem_reclaim;
 };
 
 struct OomdSystemContext {
@@ -57,10 +58,6 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
  * Returns -ENOMEM for allocation errors. */
 int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
 
-/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
- * current sum is higher than the last interval's sum (there was some reclaim activity). */
-bool oomd_memory_reclaim(Hashmap *h);
-
 /* Returns true if the amount of swap free is below the permyriad of swap specified by `threshold_permyriad`. */
 bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad);

View File

@@ -153,9 +153,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == -EEXIST);
 
         /* make sure certain values from h1 get updated in h2 */
-        c1->pgscan = 5555;
+        c1->pgscan = UINT64_MAX;
         c1->mem_pressure_limit = 6789;
         c1->last_hit_mem_pressure_limit = 42;
+        c1->last_had_mem_reclaim = 888;
         assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
         assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
         c1 = hashmap_get(h1, cgroup);
@@ -163,9 +164,10 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
         assert_se(c1);
         assert_se(c2);
         assert_se(c1 != c2);
-        assert_se(c2->last_pgscan == 5555);
+        assert_se(c2->last_pgscan == UINT64_MAX);
         assert_se(c2->mem_pressure_limit == 6789);
         assert_se(c2->last_hit_mem_pressure_limit == 42);
+        assert_se(c2->last_had_mem_reclaim == 888); /* assumes the live pgscan is less than UINT64_MAX */
 
         /* Assert that avoid/omit are not set if the cgroup is not owned by root */
         if (test_xattrs) {
@@ -182,20 +184,22 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         char **paths = STRV_MAKE("/0.slice",
                                  "/1.slice");
 
-        OomdCGroupContext ctx_old[3] = {
+        OomdCGroupContext ctx_old[2] = {
                 { .path = paths[0],
                   .mem_pressure_limit = 5,
                   .last_hit_mem_pressure_limit = 777,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 57 },
                 { .path = paths[1],
                   .mem_pressure_limit = 6,
                   .last_hit_mem_pressure_limit = 888,
+                  .last_had_mem_reclaim = 888,
                   .pgscan = 42 },
         };
 
-        OomdCGroupContext ctx_new[3] = {
+        OomdCGroupContext ctx_new[2] = {
                 { .path = paths[0],
-                  .pgscan = 100 },
+                  .pgscan = 57 },
                 { .path = paths[1],
                   .pgscan = 101 },
         };
@@ -215,12 +219,14 @@ static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
         assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_old->last_had_mem_reclaim == c_new->last_had_mem_reclaim);
 
         assert_se(c_old = hashmap_get(h_old, "/1.slice"));
         assert_se(c_new = hashmap_get(h_new, "/1.slice"));
         assert_se(c_old->pgscan == c_new->last_pgscan);
         assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
         assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
+        assert_se(c_new->last_had_mem_reclaim > c_old->last_had_mem_reclaim);
 }
 
 static void test_oomd_system_context_acquire(void) {
@@ -304,47 +310,6 @@ static void test_oomd_pressure_above(void) {
         assert_se(c->last_hit_mem_pressure_limit == 0);
 }
 
-static void test_oomd_memory_reclaim(void) {
-        _cleanup_hashmap_free_ Hashmap *h1 = NULL;
-        char **paths = STRV_MAKE("/0.slice",
-                                 "/1.slice",
-                                 "/2.slice",
-                                 "/3.slice",
-                                 "/4.slice");
-
-        OomdCGroupContext ctx[5] = {
-                { .path = paths[0],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[1],
-                  .last_pgscan = 100,
-                  .pgscan = 100 },
-                { .path = paths[2],
-                  .last_pgscan = 77,
-                  .pgscan = 33 },
-                { .path = paths[3],
-                  .last_pgscan = UINT64_MAX,
-                  .pgscan = 100 },
-                { .path = paths[4],
-                  .last_pgscan = 100,
-                  .pgscan = UINT64_MAX },
-        };
-
-        assert_se(h1 = hashmap_new(&string_hash_ops));
-
-        assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
-        assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-
-        assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == true);
-
-        assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
-        assert_se(oomd_memory_reclaim(h1) == false);
-}
-
 static void test_oomd_swap_free_below(void) {
         OomdSystemContext ctx = (OomdSystemContext) {
                 .swap_total = 20971512 * 1024U,
@@ -461,7 +426,6 @@ int main(void) {
         test_oomd_update_cgroup_contexts_between_hashmaps();
         test_oomd_system_context_acquire();
         test_oomd_pressure_above();
-        test_oomd_memory_reclaim();
         test_oomd_swap_free_below();
         test_oomd_sort_cgroups();