1
1
mirror of https://github.com/systemd/systemd-stable.git synced 2025-01-22 22:03:43 +03:00

Merge pull request #19011 from anitazha/pgscanrate

oomd: update pressure based kills to use pgscan rate
This commit is contained in:
Lennart Poettering 2021-03-17 23:26:49 +01:00 committed by GitHub
commit 22bf8ff8e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 204 additions and 48 deletions

View File

@ -117,6 +117,8 @@ static int process_managed_oom_reply(
r = ret;
goto finish;
}
if (ret < 0 && ret != -EEXIST)
log_debug_errno(ret, "Failed to insert reply, ignoring: %m");
/* Always update the limit in case it was changed. For non-memory pressure detection the value is
* ignored so always updating it here is not a problem. */
@ -156,7 +158,11 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
return r;
else if (r == 0) { /* No subgroups? We're a leaf node */
r = oomd_insert_cgroup_context(NULL, new_h, path);
return (r == -ENOMEM) ? r : 0;
if (r == -ENOMEM)
return r;
if (r < 0)
log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
return 0;
}
do {
@ -171,8 +177,12 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
/* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
if (r < 0)
return (r == -ENOMEM) ? r : 0;
if (r == -ENOMEM)
return r;
if (r < 0) {
log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
return 0;
}
if (oom_group)
r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
@ -180,6 +190,8 @@ static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
r = recursively_get_cgroup_context(new_h, cg_path);
if (r == -ENOMEM)
return r;
if (r < 0)
log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
} while ((r = cg_read_subgroup(d, &subpath)) > 0);
return 0;
@ -201,6 +213,8 @@ static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
if (r == -ENOMEM)
return r;
if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
}
hashmap_free(*monitored_cgroups);
@ -225,6 +239,8 @@ static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups,
r = recursively_get_cgroup_context(candidates, ctx->path);
if (r == -ENOMEM)
return r;
if (r < 0)
log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
}
*ret_candidates = TAKE_PTR(candidates);
@ -232,6 +248,26 @@ static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups,
return 0;
}
static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
_cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
int r;
assert(monitored_cgroups);
assert(candidates);
assert(*candidates);
r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
if (r < 0)
return log_debug_errno(r, "Failed to get candidate contexts: %m");
oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);
hashmap_free(*candidates);
*candidates = TAKE_PTR(new_candidates);
return 0;
}
static int acquire_managed_oom_connect(Manager *m) {
_cleanup_(varlink_close_unrefp) Varlink *link = NULL;
int r;
@ -275,33 +311,44 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
/* Reset timer */
r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
if (r < 0)
return log_error_errno(r, "Failed to reset event timer");
return log_error_errno(r, "Failed to reset event timer: %m");
r = sd_event_source_set_time_relative(s, INTERVAL_USEC);
if (r < 0)
return log_error_errno(r, "Failed to set relative time for timer");
return log_error_errno(r, "Failed to set relative time for timer: %m");
/* Reconnect if our connection dropped */
if (!m->varlink) {
r = acquire_managed_oom_connect(m);
if (r < 0)
return log_error_errno(r, "Failed to acquire varlink connection");
return log_error_errno(r, "Failed to acquire varlink connection: %m");
}
/* Update the cgroups used for detection/action */
r = update_monitored_cgroup_contexts(&m->monitored_swap_cgroup_contexts);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to update monitored swap cgroup contexts");
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored swap cgroup contexts, ignoring: %m");
r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");
r = update_monitored_cgroup_contexts_candidates(
m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
if (r == -ENOMEM)
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
* Allow ENOENT in the event that swap is disabled on the system. */
if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
return log_error_errno(r, "Failed to acquire system context");
return log_error_errno(r, "Failed to acquire system context: %m");
else if (r == -ENOENT)
zero(m->system_context);
@ -318,7 +365,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
else if (r == 1) {
/* Check if there was reclaim activity in the given interval. The concern is the following case:
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
@ -326,22 +375,17 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
* to kill something (it won't help anyways). */
if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
OomdCGroupContext *t;
r = get_monitored_cgroup_contexts_candidates(m->monitored_mem_pressure_cgroup_contexts, &candidates);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to get monitored memory pressure cgroup candidates");
SET_FOREACH(t, targets) {
log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to kill cgroup processes by pgscan");
return log_oom();
if (r < 0)
log_info("Failed to kill any cgroup(s) under %s based on pressure", t->path);
log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
else {
/* Don't act on all the high pressure cgroups at once; return as soon as we kill one */
m->post_action_delay_start = usec_now;
@ -359,13 +403,15 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to get monitored swap cgroup candidates");
return log_oom();
if (r < 0)
log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");
r = oomd_kill_by_swap_usage(candidates, m->dry_run);
if (r == -ENOMEM)
return log_error_errno(r, "Failed to kill cgroup processes by swap usage");
return log_oom();
if (r < 0)
log_info("Failed to kill any cgroup(s) based on swap");
log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
else {
m->post_action_delay_start = usec_now;
return 0;
@ -412,6 +458,7 @@ Manager* manager_free(Manager *m) {
hashmap_free(m->monitored_swap_cgroup_contexts);
hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);
return mfree(m);
}
@ -448,6 +495,10 @@ int manager_new(Manager **ret) {
if (!m->monitored_mem_pressure_cgroup_contexts)
return -ENOMEM;
m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
return -ENOMEM;
*ret = TAKE_PTR(m);
return 0;
}

View File

@ -40,6 +40,7 @@ struct Manager {
* Used to detect when to take action. */
Hashmap *monitored_swap_cgroup_contexts;
Hashmap *monitored_mem_pressure_cgroup_contexts;
Hashmap *monitored_mem_pressure_cgroup_contexts_candidates;
OomdSystemContext system_context;

View File

@ -208,13 +208,13 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) {
return set_size(pids_killed) != 0;
}
int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) {
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run) {
_cleanup_free_ OomdCGroupContext **sorted = NULL;
int r;
assert(h);
r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted);
r = oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, prefix, &sorted);
if (r < 0)
return r;
@ -413,6 +413,25 @@ int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path)
return 0;
}
void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h) {
OomdCGroupContext *ctx;
assert(old_h);
assert(curr_h);
HASHMAP_FOREACH(ctx, curr_h) {
OomdCGroupContext *old_ctx;
old_ctx = hashmap_get(old_h, ctx->path);
if (!old_ctx)
continue;
ctx->last_pgscan = old_ctx->pgscan;
ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
}
}
void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix) {
char swap[FORMAT_BYTES_MAX];
@ -458,10 +477,12 @@ void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE
fprintf(f,
"%s\tMemory Min: %s\n"
"%s\tMemory Low: %s\n"
"%s\tPgscan: %" PRIu64 "\n",
"%s\tPgscan: %" PRIu64 "\n"
"%s\tLast Pgscan: %" PRIu64 "\n",
strempty(prefix), format_bytes_cgroup_protection(mem_min, sizeof(mem_min), ctx->memory_min),
strempty(prefix), format_bytes_cgroup_protection(mem_low, sizeof(mem_low), ctx->memory_low),
strempty(prefix), ctx->pgscan);
strempty(prefix), ctx->pgscan,
strempty(prefix), ctx->last_pgscan);
}
void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix) {

View File

@ -66,7 +66,8 @@ bool oomd_swap_free_below(const OomdSystemContext *ctx, int threshold_permyriad)
/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end
* (after the smallest values). */
static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
static inline int compare_pgscan_rate_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) {
uint64_t last1, last2;
int r;
assert(c1);
@ -76,7 +77,22 @@ static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1,
if (r != 0)
return r;
r = CMP((*c2)->pgscan, (*c1)->pgscan);
/* If last_pgscan > pgscan, assume the cgroup was recreated and reset last_pgscan to zero. */
last2 = (*c2)->last_pgscan;
if ((*c2)->last_pgscan > (*c2)->pgscan) {
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
(*c2)->last_pgscan, (*c2)->pgscan, (*c2)->path);
last2 = 0;
}
last1 = (*c1)->last_pgscan;
if ((*c1)->last_pgscan > (*c1)->pgscan) {
log_info("Last pgscan %" PRIu64 "greater than current pgscan %" PRIu64 "for %s. Using last pgscan of zero.",
(*c1)->last_pgscan, (*c1)->pgscan, (*c1)->path);
last1 = 0;
}
r = CMP((*c2)->pgscan - last2, (*c1)->pgscan - last1);
if (r != 0)
return r;
@ -107,7 +123,7 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */
/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
* everything in `h` is a candidate. */
int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run);
int oomd_kill_by_pgscan_rate(Hashmap *h, const char *prefix, bool dry_run);
int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run);
int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret);
@ -119,6 +135,9 @@ int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *
* was no prior data to reference. */
int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path);
/* Update each OomdCGroupContext in `curr_h` with prior interval information from `old_h`. */
void oomd_update_cgroup_contexts_between_hashmaps(Hashmap *old_h, Hashmap *curr_h);
void oomd_dump_swap_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE *f, const char *prefix);
void oomd_dump_system_context(const OomdSystemContext *ctx, FILE *f, const char *prefix);

View File

@ -176,6 +176,53 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) {
}
}
static void test_oomd_update_cgroup_contexts_between_hashmaps(void) {
_cleanup_hashmap_free_ Hashmap *h_old = NULL, *h_new = NULL;
OomdCGroupContext *c_old, *c_new;
char **paths = STRV_MAKE("/0.slice",
"/1.slice");
OomdCGroupContext ctx_old[3] = {
{ .path = paths[0],
.mem_pressure_limit = 5,
.last_hit_mem_pressure_limit = 777,
.pgscan = 57 },
{ .path = paths[1],
.mem_pressure_limit = 6,
.last_hit_mem_pressure_limit = 888,
.pgscan = 42 },
};
OomdCGroupContext ctx_new[3] = {
{ .path = paths[0],
.pgscan = 100 },
{ .path = paths[1],
.pgscan = 101 },
};
assert_se(h_old = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h_old, paths[0], &ctx_old[0]) >= 0);
assert_se(hashmap_put(h_old, paths[1], &ctx_old[1]) >= 0);
assert_se(h_new = hashmap_new(&string_hash_ops));
assert_se(hashmap_put(h_new, paths[0], &ctx_new[0]) >= 0);
assert_se(hashmap_put(h_new, paths[1], &ctx_new[1]) >= 0);
oomd_update_cgroup_contexts_between_hashmaps(h_old, h_new);
assert_se(c_old = hashmap_get(h_old, "/0.slice"));
assert_se(c_new = hashmap_get(h_new, "/0.slice"));
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
assert_se(c_old = hashmap_get(h_old, "/1.slice"));
assert_se(c_new = hashmap_get(h_new, "/1.slice"));
assert_se(c_old->pgscan == c_new->last_pgscan);
assert_se(c_old->mem_pressure_limit == c_new->mem_pressure_limit);
assert_se(c_old->last_hit_mem_pressure_limit == c_new->last_hit_mem_pressure_limit);
}
static void test_oomd_system_context_acquire(void) {
_cleanup_(unlink_tempfilep) char path[] = "/oomdgetsysctxtestXXXXXX";
OomdSystemContext ctx;
@ -325,33 +372,45 @@ static void test_oomd_sort_cgroups(void) {
"/herp.slice/derp.scope",
"/herp.slice/derp.scope/sheep.service",
"/zupa.slice",
"/boop.slice",
"/omitted.slice",
"/avoid.slice");
OomdCGroupContext ctx[6] = {
OomdCGroupContext ctx[7] = {
{ .path = paths[0],
.swap_usage = 20,
.pgscan = 60,
.last_pgscan = 0,
.pgscan = 33,
.current_memory_usage = 10 },
{ .path = paths[1],
.swap_usage = 60,
.pgscan = 40,
.last_pgscan = 33,
.pgscan = 1,
.current_memory_usage = 20 },
{ .path = paths[2],
.swap_usage = 40,
.pgscan = 40,
.last_pgscan = 1,
.pgscan = 33,
.current_memory_usage = 40 },
{ .path = paths[3],
.swap_usage = 10,
.pgscan = 80,
.last_pgscan = 33,
.pgscan = 2,
.current_memory_usage = 10 },
{ .path = paths[4],
.swap_usage = 90,
.pgscan = 100,
.preference = MANAGED_OOM_PREFERENCE_OMIT },
.swap_usage = 11,
.last_pgscan = 33,
.pgscan = 33,
.current_memory_usage = 10 },
{ .path = paths[5],
.swap_usage = 90,
.last_pgscan = 0,
.pgscan = UINT64_MAX,
.preference = MANAGED_OOM_PREFERENCE_OMIT },
{ .path = paths[6],
.swap_usage = 99,
.pgscan = 200,
.last_pgscan = 0,
.pgscan = UINT64_MAX,
.preference = MANAGED_OOM_PREFERENCE_AVOID },
};
@ -361,32 +420,36 @@ static void test_oomd_sort_cgroups(void) {
assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0);
assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0);
assert_se(hashmap_put(h, "/boop.slice", &ctx[4]) >= 0);
assert_se(hashmap_put(h, "/omitted.slice", &ctx[5]) >= 0);
assert_se(hashmap_put(h, "/avoid.slice", &ctx[6]) >= 0);
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5);
assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 6);
assert_se(sorted_cgroups[0] == &ctx[1]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[0]);
assert_se(sorted_cgroups[3] == &ctx[3]);
assert_se(sorted_cgroups[4] == &ctx[5]);
assert_se(sorted_cgroups[3] == &ctx[4]);
assert_se(sorted_cgroups[4] == &ctx[3]);
assert_se(sorted_cgroups[5] == &ctx[6]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5);
assert_se(sorted_cgroups[0] == &ctx[3]);
assert_se(sorted_cgroups[1] == &ctx[0]);
assert_se(sorted_cgroups[2] == &ctx[2]);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, NULL, &sorted_cgroups) == 6);
assert_se(sorted_cgroups[0] == &ctx[0]);
assert_se(sorted_cgroups[1] == &ctx[2]);
assert_se(sorted_cgroups[2] == &ctx[3]);
assert_se(sorted_cgroups[3] == &ctx[1]);
assert_se(sorted_cgroups[4] == &ctx[5]);
assert_se(sorted_cgroups[4] == &ctx[4]);
assert_se(sorted_cgroups[5] == &ctx[6]);
sorted_cgroups = mfree(sorted_cgroups);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_rate_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
assert_se(sorted_cgroups[0] == &ctx[2]);
assert_se(sorted_cgroups[1] == &ctx[1]);
assert_se(sorted_cgroups[2] == 0);
assert_se(sorted_cgroups[3] == 0);
assert_se(sorted_cgroups[4] == 0);
assert_se(sorted_cgroups[5] == 0);
assert_se(sorted_cgroups[6] == 0);
sorted_cgroups = mfree(sorted_cgroups);
}
@ -395,6 +458,7 @@ int main(void) {
test_setup_logging(LOG_DEBUG);
test_oomd_update_cgroup_contexts_between_hashmaps();
test_oomd_system_context_acquire();
test_oomd_pressure_above();
test_oomd_memory_reclaim();