diff --git a/meson.build b/meson.build
index 6f9ab9c2e54..3ef86fdbec6 100644
--- a/meson.build
+++ b/meson.build
@@ -1449,6 +1449,7 @@ foreach term : ['analyze',
                 'networkd',
                 'nss-myhostname',
                 'nss-systemd',
+                'oomd',
                 'portabled',
                 'pstore',
                 'quotacheck',
@@ -1669,6 +1670,7 @@ subdir('src/analyze')
 subdir('src/journal-remote')
 subdir('src/coredump')
 subdir('src/pstore')
+subdir('src/oom')
 subdir('src/hostname')
 subdir('src/import')
 subdir('src/partition')
@@ -3678,6 +3680,7 @@ foreach tuple : [
         ['DNS-over-TLS(openssl)', conf.get('DNS_OVER_TLS_USE_OPENSSL') == 1],
         ['coredump'],
         ['pstore'],
+        ['oomd'],
         ['polkit'],
         ['legacy pkla', install_polkit_pkla],
         ['efi'],
diff --git a/meson_options.txt b/meson_options.txt
index d5ce647ae6e..b789e4d77f2 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -97,6 +97,8 @@ option('coredump', type : 'boolean',
        description : 'install the coredump handler')
 option('pstore', type : 'boolean',
        description : 'install the pstore archival tool')
+option('oomd', type : 'boolean',
+       description : 'install the userspace oom killer')
 option('logind', type : 'boolean',
        description : 'install the systemd-logind stack')
 option('hostnamed', type : 'boolean',
diff --git a/src/oom/meson.build b/src/oom/meson.build
new file mode 100644
index 00000000000..30738090257
--- /dev/null
+++ b/src/oom/meson.build
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: LGPL-2.1+
+
+systemd_oomd_sources = files('''
+        oomd-util.c
+        oomd-util.h
+'''.split())
+
+if conf.get('ENABLE_OOMD') == 1
+        tests += [
+                [['src/oom/test-oomd-util.c',
+                  'src/oom/oomd-util.c',
+                  'src/oom/oomd-util.h'],
+                 [],
+                 []]
+        ]
+endif
diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c
new file mode 100644
index 00000000000..c8786c745f4
--- /dev/null
+++ b/src/oom/oomd-util.c
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include /* NOTE(review): header name is missing on this and the next line (angle-bracket text apparently lost); restore before building */
+#include
+
+#include "fd-util.h"
+#include "format-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "procfs-util.h"
+#include "signal-util.h"
+#include "sort-util.h"
+#include "stat-util.h"
+#include "stdio-util.h"
+
+DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR( /* hash ops for path-keyed maps whose values are released with oomd_cgroup_context_free() */
+                oomd_cgroup_ctx_hash_ops,
+                char,
+                string_hash_func,
+                string_compare_func,
+                OomdCGroupContext,
+                oomd_cgroup_context_free);
+
+static int log_kill(pid_t pid, int sig, void *userdata) { /* callback handed to cg_kill()/cg_kill_recursive() by oomd_cgroup_kill(); it only logs and ignores userdata */
+        log_debug("oomd attempting to kill " PID_FMT " with %s", pid, signal_to_string(sig));
+        return 0;
+}
+
+static int increment_oomd_xattr(const char *path, const char *xattr, uint64_t num_procs_killed) { /* adds num_procs_killed to the decimal counter kept in `xattr` of cgroup `path`; returns 0 or a negative errno */
+        _cleanup_free_ char *value = NULL;
+        char buf[DECIMAL_STR_MAX(uint64_t) + 1];
+        uint64_t curr_count = 0;
+        int r;
+
+        assert(path);
+        assert(xattr);
+
+        r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, xattr, &value);
+        if (r < 0 && r != -ENODATA) /* -ENODATA is tolerated: value stays NULL and the count starts at 0 */
+                return r;
+
+        if (!isempty(value)) {
+                r = safe_atou64(value, &curr_count);
+                if (r < 0)
+                        return r;
+        }
+
+        if (curr_count > UINT64_MAX - num_procs_killed) /* refuse to wrap the counter */
+                return -EOVERFLOW;
+
+        xsprintf(buf, "%"PRIu64, curr_count + num_procs_killed);
+        r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, path, xattr, buf, strlen(buf), 0);
+        if (r < 0)
+                return r;
+
+        return 0;
+}
+
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx) { /* NULL-safe destructor: releases ctx->path, then ctx itself */
+        if (!ctx)
+                return NULL;
+
+        free(ctx->path);
+        return mfree(ctx);
+}
+
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret) { /* collects every context in `h` whose avg10 memory pressure has stayed above its limit for at least `duration` */
+        _cleanup_set_free_ Set *targets = NULL;
+        OomdCGroupContext *ctx;
+        char *key;
+        int r;
+
+        assert(h);
+        assert(ret);
+
+        targets = set_new(NULL);
+        if (!targets)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH_KEY(ctx, key, h) {
+                if (ctx->memory_pressure.avg10 > ctx->mem_pressure_limit) {
+                        usec_t diff;
+
+                        if (ctx->last_hit_mem_pressure_limit == 0) /* 0 means "not above the limit so far": start the clock now */
+                                ctx->last_hit_mem_pressure_limit = now(CLOCK_MONOTONIC);
+
+                        diff = now(CLOCK_MONOTONIC) - ctx->last_hit_mem_pressure_limit;
+                        if (diff >= duration) {
+                                r = set_put(targets, ctx); /* the set stores the pointer only; the context stays owned by `h` */
+                                if (r < 0)
+                                        return -ENOMEM;
+                        }
+                } else
+                        ctx->last_hit_mem_pressure_limit = 0;
+        }
+
+        if (!set_isempty(targets)) {
+                *ret = TAKE_PTR(targets); /* hand the set over to the caller */
+                return 1;
+        }
+
+        *ret = NULL; /* nothing qualified: the caller gets NULL, not an empty set */
+        return 0;
+}
+
+bool oomd_memory_reclaim(Hashmap *h) { /* true if the summed pgscan of all contexts in `h` grew since the previous interval */
+        uint64_t pgscan = 0, pgscan_of = 0, last_pgscan = 0, last_pgscan_of = 0;
+        OomdCGroupContext *ctx;
+
+        assert(h);
+
+        /* If the sum of all the current pgscan values is greater than the sum of all the last_pgscan values,
+         * there was reclaim activity. Used along with pressure checks to decide whether to take action. */
+
+        HASHMAP_FOREACH(ctx, h) {
+                uint64_t sum;
+
+                sum = pgscan + ctx->pgscan;
+                if (sum < pgscan || sum < ctx->pgscan)
+                        pgscan_of++; /* count overflows */
+                pgscan = sum;
+
+                sum = last_pgscan + ctx->last_pgscan;
+                if (sum < last_pgscan || sum < ctx->last_pgscan)
+                        last_pgscan_of++; /* count overflows */
+                last_pgscan = sum;
+        }
+
+        /* overflow counts are the same, return sums comparison */
+        if (last_pgscan_of == pgscan_of)
+                return pgscan > last_pgscan;
+
+        return pgscan_of > last_pgscan_of; /* otherwise the side that wrapped more often is the larger total */
+}
+
+bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent) { /* true if free swap (total - used) is below threshold_percent % of total swap */
+        uint64_t swap_threshold;
+
+        assert(ctx);
+        assert(threshold_percent <= 100);
+
+        swap_threshold = ctx->swap_total * threshold_percent / ((uint64_t) 100); /* NOTE(review): assumes swap_total * 100 fits in 64 bits - TODO confirm */
+        return (ctx->swap_total - ctx->swap_used) < swap_threshold; /* NOTE(review): assumes swap_used <= swap_total - TODO confirm */
+}
+
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret) { /* returns in *ret an array of the (optionally prefix-filtered) contexts of `h`, ordered by compare_func */
+        _cleanup_free_ OomdCGroupContext **sorted = NULL;
+        OomdCGroupContext *item;
+        size_t k = 0;
+
+        assert(h);
+        assert(compare_func);
+        assert(ret);
+
+        sorted = new0(OomdCGroupContext*, hashmap_size(h)); /* sized for every entry of `h`, even if the prefix filter keeps fewer */
+        if (!sorted)
+                return -ENOMEM;
+
+        HASHMAP_FOREACH(item, h) {
+                if (item->path && prefix && !path_startswith(item->path, prefix)) /* entries without a path are never filtered out */
+                        continue;
+
+                sorted[k++] = item;
+        }
+
+        typesafe_qsort(sorted, k, compare_func); /* only the k used slots are sorted */
+
+        *ret = TAKE_PTR(sorted); /* the caller frees the array; the elements remain owned by `h` */
+
+        assert(k <= INT_MAX);
+        return (int) k;
+}
+
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { /* SIGKILLs the processes of cgroup `path`; returns 1 if any were killed, 0 if none, negative on error */
+
_cleanup_set_free_ Set *pids_killed = NULL; /* handed to cg_kill*(); its size is used below as the number of processes killed */
+        int r;
+
+        assert(path);
+
+        if (dry_run) { /* only log what would be killed; no signal is sent */
+                _cleanup_free_ char *cg_path = NULL;
+
+                r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &cg_path);
+                if (r < 0)
+                        return r;
+
+                log_debug("oomd dry-run: Would have tried to kill %s with recurse=%s", cg_path, true_false(recurse));
+                return 0;
+        }
+
+        pids_killed = set_new(NULL);
+        if (!pids_killed)
+                return -ENOMEM;
+
+        if (recurse)
+                r = cg_kill_recursive(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+        else
+                r = cg_kill(SYSTEMD_CGROUP_CONTROLLER, path, SIGKILL, CGROUP_IGNORE_SELF, pids_killed, log_kill, NULL);
+        if (r < 0)
+                return r;
+
+        r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed));
+        if (r < 0) /* best effort: failing to record the kill count is only logged */
+                log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m");
+
+        return set_size(pids_killed) != 0;
+}
+
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { /* tries candidates in descending pgscan order; returns 1 once something was killed, 0 if nothing was, negative on error */
+        _cleanup_free_ OomdCGroupContext **sorted = NULL;
+        int n, r = 0; /* n: number of candidates; r: result of the last kill attempt (kept apart so the loop bound is never clobbered) */
+
+        assert(h);
+
+        n = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted);
+        if (n < 0)
+                return n;
+
+        for (int i = 0; i < n; i++) {
+                if (sorted[i]->pgscan == 0) /* sorted descending: nothing further has any reclaim activity */
+                        break;
+
+                r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+                if (r > 0 || r == -ENOMEM) /* stop after the first successful kill, or when out of memory */
+                        break;
+        }
+
+        return r;
+}
+
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { /* same as oomd_kill_by_pgscan(), but ordered by swap usage and without a prefix filter */
+        _cleanup_free_ OomdCGroupContext **sorted = NULL;
+        int n, r = 0; /* n: number of candidates; r: result of the last kill attempt */
+
+        assert(h);
+
+        n = oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted);
+        if (n < 0)
+                return n;
+
+        /* Try to kill cgroups with non-zero swap usage until we either succeed in
+         * killing or we get to a cgroup with no swap usage. */
+        for (int i = 0; i < n; i++) {
+                if (sorted[i]->swap_usage == 0)
+                        break;
+
+                r = oomd_cgroup_kill(sorted[i]->path, true, dry_run);
+                if (r > 0 || r == -ENOMEM)
+                        break;
+        }
+
+        return r;
+}
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { /* allocates *ret and fills it from the memory attributes of cgroup `path`; returns 0 or a negative errno */
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+        _cleanup_free_ char *p = NULL, *val = NULL;
+        bool is_root;
+        int r;
+
+        assert(path);
+        assert(ret);
+
+        ctx = new0(OomdCGroupContext, 1);
+        if (!ctx)
+                return -ENOMEM;
+
+        is_root = empty_or_root(path);
+
+        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, "memory.pressure", &p);
+        if (r < 0)
+                return log_debug_errno(r, "Error getting cgroup memory pressure path from %s: %m", path);
+
+        r = read_resource_pressure(p, PRESSURE_TYPE_FULL, &ctx->memory_pressure);
+        if (r < 0)
+                return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p);
+
+        if (is_root) { /* for the root cgroup only the memory usage is read (from procfs); the other counters are left untouched */
+                r = procfs_memory_get_used(&ctx->current_memory_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory used from procfs: %m");
+        } else {
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.current", &ctx->current_memory_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.current from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.min", &ctx->memory_min);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.min from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.low", &ctx->memory_low);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.low from %s: %m", path);
+
+                r = cg_get_attribute_as_uint64(SYSTEMD_CGROUP_CONTROLLER, path, "memory.swap.current", &ctx->swap_usage);
+                if (r < 0)
+                        return log_debug_errno(r, "Error getting memory.swap.current from %s: %m", path);
+
+                r = cg_get_keyed_attribute(SYSTEMD_CGROUP_CONTROLLER, path, "memory.stat", STRV_MAKE("pgscan"), &val);
+                if (r < 0)
+                        return
log_debug_errno(r, "Error getting pgscan from memory.stat under %s: %m", path);
+
+                r = safe_atou64(val, &ctx->pgscan);
+                if (r < 0)
+                        return log_debug_errno(r, "Error converting pgscan value to uint64_t: %m");
+        }
+
+        ctx->path = strdup(empty_to_root(path)); /* the context owns its own copy of the path */
+        if (!ctx->path)
+                return -ENOMEM;
+
+        *ret = TAKE_PTR(ctx); /* ownership passes to the caller */
+        return 0;
+}
+
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret) { /* sums the size and used columns of a /proc/swaps-style file into *ret; returns 0 or a negative errno */
+        _cleanup_fclose_ FILE *f = NULL;
+        OomdSystemContext ctx = {};
+        int r;
+
+        assert(proc_swaps_path);
+        assert(ret);
+
+        f = fopen(proc_swaps_path, "re");
+        if (!f)
+                return -errno;
+
+        (void) fscanf(f, "%*s %*s %*s %*s %*s\n"); /* skip the five header words; the result is ignored on purpose, the loop below notices an empty file */
+
+        for (;;) {
+                uint64_t total, used;
+
+                r = fscanf(f,
+                           "%*s "                /* device/file */
+                           "%*s "                /* type of swap */
+                           "%" PRIu64 " "        /* swap size */
+                           "%" PRIu64 " "        /* used */
+                           "%*s\n",              /* priority */
+                           &total, &used);
+
+                if (r == EOF && feof(f)) /* clean end of file */
+                        break;
+
+                if (r != 2) { /* either a read error or a line that does not match the expected layout */
+                        if (ferror(f))
+                                return log_debug_errno(errno, "Error reading from %s: %m", proc_swaps_path);
+
+                        return log_debug_errno(SYNTHETIC_ERRNO(EIO),
+                                               "Failed to parse values from %s: %m", proc_swaps_path);
+                }
+
+                ctx.swap_total += total * 1024U; /* values are scaled by 1024 (presumably KiB to bytes - confirm against proc(5)) */
+                ctx.swap_used += used * 1024U;
+        }
+
+        *ret = ctx; /* *ret is only written on success */
+        return 0;
+}
+
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path) { /* acquires a fresh context for `path`, carries over interval state from old_h, and stores it in new_h */
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *curr_ctx = NULL;
+        OomdCGroupContext *old_ctx;
+        int r;
+
+        assert(new_h);
+        assert(path);
+
+        r = oomd_cgroup_context_acquire(path, &curr_ctx);
+        if (r < 0)
+                return log_debug_errno(r, "Failed to get OomdCGroupContext for %s: %m", path);
+
+        old_ctx = hashmap_get(old_h, path); /* old_h may be NULL (see oomd-util.h); then there is nothing to carry over */
+        if (old_ctx) {
+                curr_ctx->last_pgscan = old_ctx->pgscan;
+                curr_ctx->mem_pressure_limit = old_ctx->mem_pressure_limit;
+                curr_ctx->last_hit_mem_pressure_limit = old_ctx->last_hit_mem_pressure_limit;
+        }
+
+        r = hashmap_put(new_h, curr_ctx->path, curr_ctx); /* the key is the path string owned by the context itself */
+        if (r < 0)
+                return r; /* not inserted: the _cleanup_ handler still owns and frees curr_ctx */
+
+        curr_ctx = NULL; /* new_h owns the context now; disarm the _cleanup_ handler */
+        return 0;
+}
diff
--git a/src/oom/oomd-util.h b/src/oom/oomd-util.h
new file mode 100644
index 00000000000..6d34d91cc2b
--- /dev/null
+++ b/src/oom/oomd-util.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+#pragma once
+
+#include /* NOTE(review): header name is missing (angle-bracket text apparently lost); restore before building */
+
+#include "hashmap.h"
+#include "psi-util.h"
+
+#define GROWING_SIZE_PERCENTILE 80
+
+extern const struct hash_ops oomd_cgroup_ctx_hash_ops;
+
+typedef struct OomdCGroupContext OomdCGroupContext;
+typedef struct OomdSystemContext OomdSystemContext;
+
+typedef int (oomd_compare_t)(OomdCGroupContext * const *, OomdCGroupContext * const *); /* comparator signature accepted by oomd_sort_cgroup_contexts() */
+
+struct OomdCGroupContext {
+        char *path; /* cgroup path, owned by the context (freed by oomd_cgroup_context_free()) */
+
+        ResourcePressure memory_pressure; /* parsed from the memory.pressure attribute of the cgroup */
+
+        uint64_t current_memory_usage; /* memory.current, or the procfs value for the root cgroup */
+
+        uint64_t memory_min; /* memory.min (not read for the root cgroup) */
+        uint64_t memory_low; /* memory.low (not read for the root cgroup) */
+        uint64_t swap_usage; /* memory.swap.current (not read for the root cgroup) */
+
+        uint64_t last_pgscan; /* pgscan of the previous context for the same path, copied by oomd_insert_cgroup_context() */
+        uint64_t pgscan; /* "pgscan" key of memory.stat (not read for the root cgroup) */
+
+        /* These are only used by oomd_pressure_above for acting on high memory pressure. */
+        loadavg_t mem_pressure_limit;
+        usec_t last_hit_mem_pressure_limit; /* CLOCK_MONOTONIC time at which the limit was first exceeded; 0 while not exceeded */
+};
+
+struct OomdSystemContext {
+        uint64_t swap_total; /* sum of the size column of the swaps file, multiplied by 1024 */
+        uint64_t swap_used; /* sum of the used column of the swaps file, multiplied by 1024 */
+};
+
+OomdCGroupContext *oomd_cgroup_context_free(OomdCGroupContext *ctx);
+DEFINE_TRIVIAL_CLEANUP_FUNC(OomdCGroupContext*, oomd_cgroup_context_free);
+
+/* All hashmaps used with these functions are expected to be of the form
+ * key: cgroup paths -> value: OomdCGroupContext. */
+
+/* Scans all the OomdCGroupContexts in `h` and returns 1 and a set of pointers to those OomdCGroupContexts in `ret`
+ * if any of them have exceeded their supplied memory pressure limits for the `duration` length of time.
+ * `last_hit_mem_pressure_limit` is updated accordingly for each entry when the limit is exceeded, and when it returns
+ * below the limit.
+ * Returns 0 and sets `ret` to NULL if no entries exceeded limits for `duration`.
+ * Returns -ENOMEM for allocation errors. */
+int oomd_pressure_above(Hashmap *h, usec_t duration, Set **ret);
+
+/* Sum up current OomdCGroupContexts' pgscan values and last interval's pgscan values in `h`. Returns true if the
+ * current sum is higher than the last interval's sum (there was some reclaim activity). */
+bool oomd_memory_reclaim(Hashmap *h);
+
+/* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */
+bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent);
+
+static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { /* orders by pgscan, largest first */
+        assert(c1);
+        assert(c2);
+
+        if ((*c1)->pgscan > (*c2)->pgscan)
+                return -1;
+        else if ((*c1)->pgscan < (*c2)->pgscan)
+                return 1;
+        else
+                return 0;
+}
+
+static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { /* orders by swap_usage, largest first */
+        assert(c1);
+        assert(c2);
+
+        if ((*c1)->swap_usage > (*c2)->swap_usage)
+                return -1;
+        else if ((*c1)->swap_usage < (*c2)->swap_usage)
+                return 1;
+        else
+                return 0;
+}
+
+/* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`.
+ * If `prefix` is not NULL, only include OomdCGroupContexts whose paths start with prefix. Otherwise all paths are sorted.
+ * Returns the number of sorted items; negative on error. */
+int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const char *prefix, OomdCGroupContext ***ret);
+
+/* Returns a negative value on error, 0 if no processes were killed, or 1 if processes were killed. */
+int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run);
+
+/* The following oomd_kill_by_* functions return 1 if processes were killed, or negative otherwise. */
+/* If `prefix` is supplied, only cgroups whose paths start with `prefix` are eligible candidates. Otherwise,
+ * everything in `h` is a candidate.
*/
+int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run);
+int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run);
+
+int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret); /* on success *ret is newly allocated; release with oomd_cgroup_context_free() */
+int oomd_system_context_acquire(const char *proc_swaps_path, OomdSystemContext *ret); /* *ret is only written on success */
+
+/* Get the OomdCGroupContext of `path` and insert it into `new_h`. The key for the inserted context will be `path`.
+ *
+ * `old_h` is used to get data used to calculate prior interval information. `old_h` can be NULL in which case there
+ * was no prior data to reference. */
+int oomd_insert_cgroup_context(Hashmap *old_h, Hashmap *new_h, const char *path);
diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c
new file mode 100644
index 00000000000..5503f8f2e01
--- /dev/null
+++ b/src/oom/test-oomd-util.c
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: LGPL-2.1+ */
+
+#include /* NOTE(review): header name is missing (angle-bracket text apparently lost); restore before building */
+
+#include "alloc-util.h"
+#include "cgroup-setup.h"
+#include "cgroup-util.h"
+#include "fileio.h"
+#include "fs-util.h"
+#include "oomd-util.h"
+#include "parse-util.h"
+#include "path-util.h"
+#include "string-util.h"
+#include "strv.h"
+#include "tests.h"
+
+static int fork_and_sleep(unsigned sleep_min) { /* forks a child that sleeps until it is killed (aborting itself after sleep_min minutes); the parent gets the child pid */
+        usec_t n, timeout, ts;
+
+        pid_t pid = fork();
+        assert_se(pid >= 0);
+
+        if (pid == 0) {
+                timeout = sleep_min * USEC_PER_MINUTE;
+                ts = now(CLOCK_MONOTONIC);
+                while (true) {
+                        n = now(CLOCK_MONOTONIC);
+                        if (ts + timeout < n) {
+                                log_error("Child timed out waiting to be killed");
+                                abort();
+                        }
+                        sleep(1);
+                }
+        }
+
+        return pid;
+}
+
+static void test_oomd_cgroup_kill(void) { /* kills two forked children via oomd_cgroup_kill(), twice, and checks the kill-count xattr reads "2" and then "4" */
+        _cleanup_free_ char *cgroup_root = NULL, *cgroup = NULL;
+        int pid[2];
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        if (cg_all_unified() <= 0)
+                return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+        assert_se(cg_pid_get_path(NULL, 0, &cgroup_root) >= 0);
+
+        /* Create another cgroup below this one for the pids we forked off. We need this to be managed
+         * by the test so that pid1 doesn't delete it before we can read the xattrs. */
+        cgroup = path_join(cgroup_root, "oomdkilltest");
+        assert_se(cgroup); /* assert_se() so the allocation check survives NDEBUG builds */
+        assert_se(cg_create(SYSTEMD_CGROUP_CONTROLLER, cgroup) >= 0); /* the cgroup must exist, otherwise the xattr probe below fails for the wrong reason */
+        /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities */
+        if (cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "test", 4, 0) == -EPERM)
+                return (void) log_tests_skipped("no permissions to set user xattrs");
+
+        /* Do this twice to also check the increment behavior on the xattrs */
+        for (int i = 0; i < 2; i++) {
+                _cleanup_free_ char *v = NULL;
+                int r;
+
+                for (int j = 0; j < 2; j++) {
+                        pid[j] = fork_and_sleep(5);
+                        assert_se(cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, cgroup, pid[j]) >= 0);
+                }
+
+                r = oomd_cgroup_kill(cgroup, false /* recurse */, false /* dry run */);
+                if (r <= 0) { /* 0 means nothing was killed, which is a failure here as well */
+                        log_debug_errno(r, "Failed to kill processes under %s: %m", cgroup);
+                        abort();
+                }
+
+                /* Wait a bit since processes may take some time to be cleaned up. */
+                sleep(2);
+                assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true);
+
+                assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0);
+                assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0); /* two bytes: the digit plus the terminating NUL */
+        }
+}
+
+static void test_oomd_cgroup_context_acquire_and_insert(void) { /* acquires contexts for the own and the root cgroup, then checks what oomd_insert_cgroup_context() carries over */
+        _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL;
+        _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL;
+        _cleanup_free_ char *cgroup = NULL;
+        OomdCGroupContext *c1, *c2;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        if (!is_pressure_supported())
+                return (void) log_tests_skipped("system does not support pressure");
+
+        if (cg_all_unified() <= 0)
+                return (void) log_tests_skipped("cgroups are not running in unified mode");
+
+        assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0);
+
+        assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0);
+
+        assert_se(streq(ctx->path, cgroup));
+        assert_se(ctx->memory_pressure.avg10 == 0);
+        assert_se(ctx->memory_pressure.avg60 == 0);
+        assert_se(ctx->memory_pressure.avg300 == 0);
+        assert_se(ctx->memory_pressure.total == 0);
+        assert_se(ctx->current_memory_usage > 0);
+        assert_se(ctx->memory_min == 0);
+        assert_se(ctx->memory_low == 0);
+        assert_se(ctx->swap_usage == 0);
+        assert_se(ctx->last_pgscan == 0);
+        assert_se(ctx->pgscan == 0);
+        ctx = oomd_cgroup_context_free(ctx);
+
+        /* Test the root cgroup */
+        assert_se(oomd_cgroup_context_acquire("", &ctx) == 0);
+        assert_se(streq(ctx->path, "/"));
+        assert_se(ctx->current_memory_usage > 0);
+
+        /* Test hashmap inserts */
+        assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
+        assert_se(oomd_insert_cgroup_context(NULL, h1, cgroup) == 0);
+        c1 = hashmap_get(h1, cgroup);
+        assert_se(c1);
+
+        /* make sure certain values from h1 get updated in h2 */
+        c1->pgscan = 5555;
+        c1->mem_pressure_limit = 6789;
+        c1->last_hit_mem_pressure_limit = 42;
+        assert_se(h2 = hashmap_new(&oomd_cgroup_ctx_hash_ops));
+        assert_se(oomd_insert_cgroup_context(h1, h2, cgroup) == 0);
+        c1 = hashmap_get(h1, cgroup);
+        c2 = hashmap_get(h2, cgroup);
+        assert_se(c1);
+        assert_se(c2);
+        assert_se(c1 != c2);
+        assert_se(c2->last_pgscan == 5555);
+
assert_se(c2->mem_pressure_limit == 6789);
+        assert_se(c2->last_hit_mem_pressure_limit == 42);
+}
+
+static void test_oomd_system_context_acquire(void) { /* feeds a missing, an empty, a malformed and a well-formed swaps file to oomd_system_context_acquire() */
+        _cleanup_(unlink_tempfilep) char path[] = "/oomdgetsysctxtestXXXXXX";
+        OomdSystemContext ctx;
+
+        if (geteuid() != 0)
+                return (void) log_tests_skipped("not root");
+
+        assert_se(close(mkstemp(path)) == 0); /* mkstemp() returns an fd, or -1 on failure; close it at once so it is not leaked (close(-1) fails and trips the assert) */
+
+        assert_se(oomd_system_context_acquire("/verylikelynonexistentpath", &ctx) == -ENOENT);
+
+        assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+        assert_se(ctx.swap_total == 0);
+        assert_se(ctx.swap_used == 0);
+
+        assert_se(write_string_file(path, "some\nwords\nacross\nmultiple\nlines", WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+        assert_se(ctx.swap_total == 0);
+        assert_se(ctx.swap_used == 0);
+
+        assert_se(write_string_file(path, "Filename Type Size Used Priority\n"
+                                          "/swapvol/swapfile file 18971644 0 -3\n"
+                                          "/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0);
+        assert_se(oomd_system_context_acquire(path, &ctx) == 0);
+        assert_se(ctx.swap_total == 21474828288); /* (18971644 + 1999868) * 1024 */
+        assert_se(ctx.swap_used == 1017630720); /* 993780 * 1024 */
+}
+
+static void test_oomd_pressure_above(void) { /* checks oomd_pressure_above() with one cgroup above and one below an 80.00 limit */
+        _cleanup_hashmap_free_ Hashmap *h1 = NULL, *h2 = NULL;
+        _cleanup_set_free_ Set *t1 = NULL, *t2 = NULL, *t3 = NULL;
+        OomdCGroupContext ctx[2] = {}, *c; /* zeroed: oomd_pressure_above() reads last_hit_mem_pressure_limit before writing it */
+        loadavg_t threshold;
+
+        assert_se(store_loadavg_fixed_point(80, 0, &threshold) == 0);
+
+        /* /herp.slice */
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg10)) == 0);
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg60)) == 0);
+        assert_se(store_loadavg_fixed_point(99, 99, &(ctx[0].memory_pressure.avg300)) == 0);
+        ctx[0].mem_pressure_limit = threshold;
+
+        /* /derp.slice */
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg10)) == 0);
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg60)) == 0);
+        assert_se(store_loadavg_fixed_point(1, 11, &(ctx[1].memory_pressure.avg300)) == 0);
+        ctx[1].mem_pressure_limit = threshold;
+
+
+        /* High memory pressure */
+        assert_se(h1 = hashmap_new(&string_hash_ops)); /* plain string ops: the contexts live on the stack and must not be freed by the map */
+        assert_se(hashmap_put(h1, "/herp.slice", &ctx[0]) >= 0);
+        assert_se(oomd_pressure_above(h1, 0 /* duration */, &t1) == 1);
+        assert_se(set_contains(t1, &ctx[0]) == true);
+        assert_se(c = hashmap_get(h1, "/herp.slice"));
+        assert_se(c->last_hit_mem_pressure_limit > 0);
+
+        /* Low memory pressure */
+        assert_se(h2 = hashmap_new(&string_hash_ops));
+        assert_se(hashmap_put(h2, "/derp.slice", &ctx[1]) >= 0);
+        assert_se(oomd_pressure_above(h2, 0 /* duration */, &t2) == 0);
+        assert_se(t2 == NULL);
+        assert_se(c = hashmap_get(h2, "/derp.slice"));
+        assert_se(c->last_hit_mem_pressure_limit == 0);
+
+        /* High memory pressure w/ multiple cgroups */
+        assert_se(hashmap_put(h1, "/derp.slice", &ctx[1]) >= 0);
+        assert_se(oomd_pressure_above(h1, 0 /* duration */, &t3) == 1);
+        assert_se(set_contains(t3, &ctx[0]) == true);
+        assert_se(set_size(t3) == 1);
+        assert_se(c = hashmap_get(h1, "/herp.slice"));
+        assert_se(c->last_hit_mem_pressure_limit > 0);
+        assert_se(c = hashmap_get(h1, "/derp.slice"));
+        assert_se(c->last_hit_mem_pressure_limit == 0);
+}
+
+static void test_oomd_memory_reclaim(void) { /* checks the pgscan sum comparison, including sums that overflow 64 bits */
+        _cleanup_hashmap_free_ Hashmap *h1 = NULL;
+        char **paths = STRV_MAKE("/0.slice",
+                                 "/1.slice",
+                                 "/2.slice",
+                                 "/3.slice",
+                                 "/4.slice");
+
+        OomdCGroupContext ctx[5] = {
+                { .path = paths[0],
+                  .last_pgscan = 100,
+                  .pgscan = 100 },
+                { .path = paths[1],
+                  .last_pgscan = 100,
+                  .pgscan = 100 },
+                { .path = paths[2],
+                  .last_pgscan = 77,
+                  .pgscan = 33 },
+                { .path = paths[3],
+                  .last_pgscan = UINT64_MAX,
+                  .pgscan = 100 },
+                { .path = paths[4],
+                  .last_pgscan = 100,
+                  .pgscan = UINT64_MAX },
+        };
+
+        assert_se(h1 = hashmap_new(&string_hash_ops));
+        assert_se(hashmap_put(h1, paths[0], &ctx[0]) >= 0);
+        assert_se(hashmap_put(h1, paths[1], &ctx[1]) >= 0);
+        assert_se(oomd_memory_reclaim(h1) == false); /* equal sums: no reclaim */
+
+
assert_se(hashmap_put(h1, paths[2], &ctx[2]) >= 0);
+        assert_se(oomd_memory_reclaim(h1) == false); /* current sum 233 is below last sum 277 */
+
+        assert_se(hashmap_put(h1, paths[4], &ctx[4]) >= 0);
+        assert_se(oomd_memory_reclaim(h1) == true); /* only the current sum overflows */
+
+        assert_se(hashmap_put(h1, paths[3], &ctx[3]) >= 0);
+        assert_se(oomd_memory_reclaim(h1) == false); /* both sums overflow once; the wrapped current sum is the smaller one */
+}
+
+static void test_oomd_swap_free_below(void) { /* checks the 20% free-swap threshold with nearly full and mostly free swap */
+        OomdSystemContext ctx = (OomdSystemContext) {
+                .swap_total = 20971512 * UINT64_C(1024), /* 64-bit multiply: with 1024U the product would wrap in 32-bit unsigned arithmetic */
+                .swap_used = 20971440 * UINT64_C(1024),
+        };
+        assert_se(oomd_swap_free_below(&ctx, 20) == true);
+
+        ctx = (OomdSystemContext) {
+                .swap_total = 20971512 * UINT64_C(1024),
+                .swap_used = 3310136 * UINT64_C(1024),
+        };
+        assert_se(oomd_swap_free_below(&ctx, 20) == false);
+}
+
+static void test_oomd_sort_cgroups(void) { /* checks ordering by swap usage and by pgscan, and the prefix filter */
+        _cleanup_hashmap_free_ Hashmap *h = NULL;
+        _cleanup_free_ OomdCGroupContext **sorted_cgroups = NULL; /* a _cleanup_ variable must be initialized before any exit path can run its handler */
+        char **paths = STRV_MAKE("/herp.slice",
+                                 "/herp.slice/derp.scope",
+                                 "/herp.slice/derp.scope/sheep.service",
+                                 "/zupa.slice");
+
+        OomdCGroupContext ctx[4] = {
+                { .path = paths[0],
+                  .swap_usage = 20,
+                  .pgscan = 60 },
+                { .path = paths[1],
+                  .swap_usage = 60,
+                  .pgscan = 40 },
+                { .path = paths[2],
+                  .swap_usage = 40,
+                  .pgscan = 20 },
+                { .path = paths[3],
+                  .swap_usage = 10,
+                  .pgscan = 80 },
+        };
+
+        assert_se(h = hashmap_new(&string_hash_ops));
+
+        assert_se(hashmap_put(h, "/herp.slice", &ctx[0]) >= 0);
+        assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0);
+        assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0);
+        assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0);
+
+        assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4);
+        assert_se(sorted_cgroups[0] == &ctx[1]);
+        assert_se(sorted_cgroups[1] == &ctx[2]);
+        assert_se(sorted_cgroups[2] == &ctx[0]);
+        assert_se(sorted_cgroups[3] == &ctx[3]);
+        sorted_cgroups = mfree(sorted_cgroups);
+
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4);
+        assert_se(sorted_cgroups[0] == &ctx[3]);
+        assert_se(sorted_cgroups[1] == &ctx[0]);
+        assert_se(sorted_cgroups[2] == &ctx[1]);
+        assert_se(sorted_cgroups[3] == &ctx[2]);
+        sorted_cgroups = mfree(sorted_cgroups);
+
+        assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, "/herp.slice/derp.scope", &sorted_cgroups) == 2);
+        assert_se(sorted_cgroups[0] == &ctx[1]);
+        assert_se(sorted_cgroups[1] == &ctx[2]);
+        assert_se(sorted_cgroups[2] == 0); /* the array is sized for all 4 entries; the slots not used by the filter are expected to be NULL */
+        assert_se(sorted_cgroups[3] == 0);
+        sorted_cgroups = mfree(sorted_cgroups);
+}
+
+int main(void) { /* runs the pure tests first, then the ones that need a live cgroup */
+        int r;
+
+        test_setup_logging(LOG_DEBUG);
+
+        test_oomd_system_context_acquire();
+        test_oomd_pressure_above();
+        test_oomd_memory_reclaim();
+        test_oomd_swap_free_below();
+        test_oomd_sort_cgroups();
+
+        /* The following tests operate on live cgroups */
+
+        r = enter_cgroup_root(NULL);
+        if (r < 0) /* no usable cgroup scope: skip the live tests instead of failing */
+                return log_tests_skipped_errno(r, "failed to enter a test cgroup scope");
+
+        test_oomd_cgroup_kill();
+        test_oomd_cgroup_context_acquire_and_insert();
+
+        return 0;
+}
diff --git a/src/shared/tests.c b/src/shared/tests.c
index a5cb486c99e..fe6d9dfbd50 100644
--- a/src/shared/tests.c
+++ b/src/shared/tests.c
@@ -254,7 +254,7 @@ static int allocate_scope(void) {
         return 0;
 }
 
-int enter_cgroup_subroot(char **ret_cgroup) {
+static int enter_cgroup(char **ret_cgroup, bool enter_subroot) { /* shared body of enter_cgroup_subroot() and enter_cgroup_root() */
         _cleanup_free_ char *cgroup_root = NULL, *cgroup_subroot = NULL;
         CGroupMask supported;
         int r;
@@ -268,7 +268,13 @@ int enter_cgroup_subroot(char **ret_cgroup) {
                 return log_warning_errno(r, "cg_pid_get_path(NULL, 0, ...) failed: %m");
         assert(r >= 0);
 
-        assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0);
+        if (enter_subroot) /* subroot: a randomly named child below the current cgroup */
+                assert_se(asprintf(&cgroup_subroot, "%s/%" PRIx64, cgroup_root, random_u64()) >= 0);
+        else { /* root: use the current cgroup path itself */
+                cgroup_subroot = strdup(cgroup_root);
+                assert_se(cgroup_subroot != NULL);
+        }
+
         assert_se(cg_mask_supported(&supported) >= 0);
 
         /* If this fails, then we don't mind as the later cgroup operations will fail too, and it's fine if
@@ -287,3 +293,11 @@ int enter_cgroup_subroot(char **ret_cgroup) {
 
         return 0;
 }
+
+int enter_cgroup_subroot(char **ret_cgroup) { /* public wrapper: enter_cgroup() with enter_subroot = true */
+        return enter_cgroup(ret_cgroup, true);
+}
+
+int enter_cgroup_root(char **ret_cgroup) { /* public wrapper: enter_cgroup() with enter_subroot = false */
+        return enter_cgroup(ret_cgroup, false);
+}
diff --git a/src/shared/tests.h b/src/shared/tests.h
index 6817ef48606..505ca39775c 100644
--- a/src/shared/tests.h
+++ b/src/shared/tests.h
@@ -20,6 +20,7 @@ static inline bool manager_errno_skip_test(int r) {
 
 char* setup_fake_runtime_dir(void);
 int enter_cgroup_subroot(char **ret_cgroup);
+int enter_cgroup_root(char **ret_cgroup);
 int get_testdata_dir(const char *suffix, char **ret);
 const char* get_catalog_dir(void);
 bool slow_tests_enabled(void);