mirror of
https://github.com/systemd/systemd.git
synced 2024-10-31 16:21:26 +03:00
Merge pull request #18361 from anitazha/oomdconfigtime
oom: some improvements and fixes
This commit is contained in:
commit
a306ec1de9
@ -65,13 +65,23 @@
|
||||
will take action. A unit can override this value with <varname>ManagedOOMMemoryPressureLimitPercent=</varname>.
|
||||
The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks
|
||||
in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the
|
||||
limit set for more than 30 seconds, <command>systemd-oomd</command> will act on eligible descendant cgroups,
|
||||
limit set for longer than the duration set by <varname>DefaultMemoryPressureDurationSec=</varname>,
|
||||
<command>systemd-oomd</command> will act on eligible descendant cgroups,
|
||||
starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are
|
||||
monitored and what action gets taken depends on what the unit has configured for
|
||||
<varname>ManagedOOMMemoryPressure=</varname>. Takes a percentage value between 0% and 100%, inclusive.
|
||||
Defaults to 60%.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>DefaultMemoryPressureDurationSec=</varname></term>
|
||||
|
||||
<listitem><para>Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before
|
||||
<command>systemd-oomd</command> will take action. Memory pressure limits are defined by
|
||||
<varname>DefaultMemoryPressureLimitPercent=</varname> and <varname>ManagedOOMMemoryPressureLimitPercent=</varname>.
|
||||
Defaults to 30 seconds when this property is unset or set to 0.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
@ -56,8 +56,8 @@
|
||||
|
||||
<para>You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above.</para>
|
||||
|
||||
<para>The system must also have swap enabled for <command>systemd-oomd</command> to function correctly. With swap
|
||||
enabled, the system spends enough time swapping pages to let <command>systemd-oomd</command> react.
|
||||
<para>It is highly recommended for the system to have swap enabled for <command>systemd-oomd</command> to function
|
||||
optimally. With swap enabled, the system spends enough time swapping pages to let <command>systemd-oomd</command> react.
|
||||
Without swap, the system enters a livelocked state much more quickly and may prevent <command>systemd-oomd</command>
|
||||
from responding in a reasonable amount of time. See
|
||||
<ulink url="https://chrisdown.name/2018/01/02/in-defence-of-swap.html">"In defence of swap: common misconceptions"</ulink>
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "cgroup-util.h"
|
||||
#include "fd-util.h"
|
||||
#include "fileio.h"
|
||||
#include "memory-util.h"
|
||||
#include "oomd-manager-bus.h"
|
||||
#include "oomd-manager.h"
|
||||
#include "path-util.h"
|
||||
@ -294,9 +295,15 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts");
|
||||
|
||||
r = oomd_system_context_acquire("/proc/swaps", &m->system_context);
|
||||
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */
|
||||
if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
|
||||
/* If there aren't units depending on swap actions, the only error we exit on is ENOMEM.
|
||||
* Allow ENOENT in the event that swap is disabled on the system. */
|
||||
if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
|
||||
return log_error_errno(r, "Failed to acquire system context");
|
||||
else if (r == -ENOENT)
|
||||
zero(m->system_context);
|
||||
|
||||
if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts))
|
||||
m->last_reclaim_at = usec_now;
|
||||
|
||||
/* If we're still recovering from a kill, don't try to kill again yet */
|
||||
if (m->post_action_delay_start > 0) {
|
||||
@ -306,16 +313,16 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
m->post_action_delay_start = 0;
|
||||
}
|
||||
|
||||
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets);
|
||||
r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
|
||||
if (r == -ENOMEM)
|
||||
return log_error_errno(r, "Failed to check if memory pressure exceeded limits");
|
||||
else if (r == 1) {
|
||||
/* Check if there was reclaim activity in the last interval. The concern is the following case:
|
||||
/* Check if there was reclaim activity in the given interval. The concern is the following case:
|
||||
* Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
|
||||
* cgroup. Even after this, well-behaved processes will fault in recently resident pages and
|
||||
* this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
|
||||
* to kill something (it won't help anyways). */
|
||||
if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) {
|
||||
if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) {
|
||||
_cleanup_hashmap_free_ Hashmap *candidates = NULL;
|
||||
OomdCGroupContext *t;
|
||||
|
||||
@ -325,7 +332,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo
|
||||
|
||||
SET_FOREACH(t, targets) {
|
||||
log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity",
|
||||
t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC);
|
||||
t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC);
|
||||
|
||||
r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run);
|
||||
if (r == -ENOMEM)
|
||||
@ -471,7 +478,7 @@ static int manager_connect_bus(Manager *m) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) {
|
||||
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) {
|
||||
unsigned long l;
|
||||
int r;
|
||||
|
||||
@ -487,6 +494,8 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;
|
||||
|
||||
r = manager_connect_bus(m);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -505,6 +514,7 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur
|
||||
int manager_get_dump_string(Manager *m, char **ret) {
|
||||
_cleanup_free_ char *dump = NULL;
|
||||
_cleanup_fclose_ FILE *f = NULL;
|
||||
char buf[FORMAT_TIMESPAN_MAX];
|
||||
OomdCGroupContext *c;
|
||||
size_t size;
|
||||
char *key;
|
||||
@ -521,10 +531,12 @@ int manager_get_dump_string(Manager *m, char **ret) {
|
||||
"Dry Run: %s\n"
|
||||
"Swap Used Limit: %u%%\n"
|
||||
"Default Memory Pressure Limit: %lu%%\n"
|
||||
"Default Memory Pressure Duration: %s\n"
|
||||
"System Context:\n",
|
||||
yes_no(m->dry_run),
|
||||
m->swap_used_limit,
|
||||
LOAD_INT(m->default_mem_pressure_limit));
|
||||
LOAD_INT(m->default_mem_pressure_limit),
|
||||
format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC));
|
||||
oomd_dump_system_context(&m->system_context, f, "\t");
|
||||
|
||||
fprintf(f, "Swap Monitored CGroups:\n");
|
||||
|
@ -16,10 +16,11 @@
|
||||
* percentage of time all tasks were delayed (i.e. unproductive).
|
||||
* Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in
|
||||
* system.slice are assumed to be less latency sensitive. */
|
||||
#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
|
||||
#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC)
|
||||
#define DEFAULT_MEM_PRESSURE_LIMIT 60
|
||||
#define DEFAULT_SWAP_USED_LIMIT 90
|
||||
|
||||
#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC)
|
||||
#define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC)
|
||||
|
||||
typedef struct Manager Manager;
|
||||
@ -33,6 +34,7 @@ struct Manager {
|
||||
bool dry_run;
|
||||
unsigned swap_used_limit;
|
||||
loadavg_t default_mem_pressure_limit;
|
||||
usec_t default_mem_pressure_duration_usec;
|
||||
|
||||
/* k: cgroup paths -> v: OomdCGroupContext
|
||||
* Used to detect when to take action. */
|
||||
@ -41,6 +43,7 @@ struct Manager {
|
||||
|
||||
OomdSystemContext system_context;
|
||||
|
||||
usec_t last_reclaim_at;
|
||||
usec_t post_action_delay_start;
|
||||
|
||||
sd_event_source *cgroup_context_event_source;
|
||||
@ -53,7 +56,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free);
|
||||
|
||||
int manager_new(Manager **ret);
|
||||
|
||||
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit);
|
||||
int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec);
|
||||
|
||||
int manager_get_dump_string(Manager *m, char **ret);
|
||||
|
||||
|
@ -31,6 +31,7 @@ struct OomdCGroupContext {
|
||||
|
||||
/* These are only used by oomd_pressure_above for acting on high memory pressure. */
|
||||
loadavg_t mem_pressure_limit;
|
||||
usec_t mem_pressure_duration_usec;
|
||||
usec_t last_hit_mem_pressure_limit;
|
||||
};
|
||||
|
||||
|
@ -19,11 +19,13 @@
|
||||
static bool arg_dry_run = false;
|
||||
static int arg_swap_used_limit = -1;
|
||||
static int arg_mem_pressure_limit = -1;
|
||||
static usec_t arg_mem_pressure_usec = 0;
|
||||
|
||||
static int parse_config(void) {
|
||||
static const ConfigTableItem items[] = {
|
||||
{ "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit },
|
||||
{ "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit },
|
||||
{ "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec },
|
||||
{}
|
||||
};
|
||||
|
||||
@ -140,10 +142,8 @@ static int run(int argc, char *argv[]) {
|
||||
return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m");
|
||||
|
||||
r = safe_atollu(swap, &s);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to parse SwapTotal from /proc/meminfo: %s: %m", swap);
|
||||
if (s == 0)
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires swap to operate");
|
||||
if (r < 0 || s == 0)
|
||||
log_warning("Swap is currently not detected; memory pressure usage will be degraded");
|
||||
|
||||
if (!is_pressure_supported())
|
||||
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported");
|
||||
@ -160,7 +160,7 @@ static int run(int argc, char *argv[]) {
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to create manager: %m");
|
||||
|
||||
r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit);
|
||||
r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec);
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to start up daemon: %m");
|
||||
|
||||
|
@ -14,3 +14,4 @@
|
||||
[OOM]
|
||||
#SwapUsedLimitPercent=90%
|
||||
#DefaultMemoryPressureLimitPercent=60%
|
||||
#DefaultMemoryPressureDurationSec=30s
|
||||
|
@ -159,6 +159,11 @@ static void test_oomd_system_context_acquire(void) {
|
||||
assert_se(ctx.swap_total == 0);
|
||||
assert_se(ctx.swap_used == 0);
|
||||
|
||||
assert_se(write_string_file(path, "Filename Type Size Used Priority", WRITE_STRING_FILE_CREATE) == 0);
|
||||
assert_se(oomd_system_context_acquire(path, &ctx) == 0);
|
||||
assert_se(ctx.swap_total == 0);
|
||||
assert_se(ctx.swap_used == 0);
|
||||
|
||||
assert_se(write_string_file(path, "Filename Type Size Used Priority\n"
|
||||
"/swapvol/swapfile file 18971644 0 -3\n"
|
||||
"/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0);
|
||||
@ -268,6 +273,12 @@ static void test_oomd_swap_free_below(void) {
|
||||
.swap_used = 3310136 * 1024U,
|
||||
};
|
||||
assert_se(oomd_swap_free_below(&ctx, 20) == false);
|
||||
|
||||
ctx = (OomdSystemContext) {
|
||||
.swap_total = 0,
|
||||
.swap_used = 0,
|
||||
};
|
||||
assert_se(oomd_swap_free_below(&ctx, 20) == false);
|
||||
}
|
||||
|
||||
static void test_oomd_sort_cgroups(void) {
|
||||
|
@ -6,7 +6,6 @@ systemd-analyze log-level debug
|
||||
systemd-analyze log-target console
|
||||
|
||||
# Loose checks to ensure the environment has the necessary features for systemd-oomd
|
||||
[[ "$( awk '/SwapTotal/ { print $2 }' /proc/meminfo )" != "0" ]] || echo "no swap" >> /skipped
|
||||
[[ -e /proc/pressure ]] || echo "no PSI" >> /skipped
|
||||
cgroup_type=$(stat -fc %T /sys/fs/cgroup/)
|
||||
if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]; then
|
||||
@ -14,12 +13,15 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]
|
||||
fi
|
||||
[[ -e /skipped ]] && exit 0 || true
|
||||
|
||||
systemctl start testsuite-56-testbloat.service
|
||||
echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf
|
||||
|
||||
systemctl start testsuite-56-testchill.service
|
||||
systemctl start testsuite-56-testbloat.service
|
||||
|
||||
# Verify systemd-oomd is monitoring the expected units
|
||||
oomctl | grep "/testsuite-56-workload.slice"
|
||||
oomctl | grep "1%"
|
||||
oomctl | grep "Default Memory Pressure Duration: 5s"
|
||||
|
||||
# systemd-oomd watches for elevated pressure for 30 seconds before acting.
|
||||
# It can take time to build up pressure so either wait 5 minutes or for the service to fail.
|
||||
|
Loading…
Reference in New Issue
Block a user