From 158fe190afe37b222c9dc2c53bd7be426b92ef89 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Feb 2023 16:44:24 +0100 Subject: [PATCH 1/3] sd-event: add high-level sd_event_add_memory_pressure() event source Typically, in reasonably complex programs we want to release various caches (such as glibc allocation caches) in case of memory pressure. Let's add explicit infrastructure for that to sd-event, that can hook Linux' Pressure Stall Information (PSI) logic with our event loop. This adds sd_event_add_memory_pressure() as an easy, one-step API to install an event source that is called under memory pressure. The parameters which file to watch (the per-cgroup PSI file, or the system-wide file /proc/pressure/memory) can be configured via env vars. The idea is that the service manager sooner or later gains controls for setting this up correctly. As an alternative to PSI, a similar logic is supported, but instead of waiting for POLLPRI on a procfs/cgroupfs fd we'll wait for POLLIN on FIFO or AF_UNIX sockets. This is useful for testing, and possibly in other environments, for example to hook up this protocol directly with GNOME's low memory monitor. By default this watches on the cgroup-local PSI so that we aren't affected by pressure on cgroups we are not related to.
--- catalog/systemd.catalog.in | 15 + src/basic/psi-util.h | 5 + src/libsystemd/sd-event/event-source.h | 12 + src/libsystemd/sd-event/sd-event.c | 582 ++++++++++++++++++++++++- src/systemd/sd-event.h | 5 + src/systemd/sd-messages.h | 3 + 6 files changed, 621 insertions(+), 1 deletion(-) diff --git a/catalog/systemd.catalog.in b/catalog/systemd.catalog.in index 975e77fcec1..82d4820b806 100644 --- a/catalog/systemd.catalog.in +++ b/catalog/systemd.catalog.in @@ -549,3 +549,18 @@ Whenever the system transitions to a new runtime phase, the specified PCR is extended with a different string, to ensure that security policies for TPM-bound secrets and other resources are limited to specific phases of the runtime. + +-- f9b0be465ad540d0850ad32172d57c21 +Subject: Memory Trimmed +Defined-By: systemd +Support: %SUPPORT_URL% + +Memory of process @_PID@ (@_COMM@) has been trimmed. + +Either on user request or as result of a memory pressure event, memory of the +process has been trimmed, returning unneeded allocation caches and other +resources back to the OS kernel, making them available for other components of +the OS. + +@TRIMMED_BYTES@ of memory were returned to the OS, which took @TRIMMED_USEC@ +micro-seconds (µs). diff --git a/src/basic/psi-util.h b/src/basic/psi-util.h index 415fbbdd47c..558a130996b 100644 --- a/src/basic/psi-util.h +++ b/src/basic/psi-util.h @@ -28,3 +28,8 @@ int read_resource_pressure(const char *path, PressureType type, ResourcePressure /* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error.
*/ int is_pressure_supported(void); + +/* Default parameters for memory pressure watch logic in sd-event and PID 1 */ +#define MEMORY_PRESSURE_DEFAULT_TYPE "some" +#define MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC (100 * USEC_PER_MSEC) +#define MEMORY_PRESSURE_DEFAULT_WINDOW_USEC USEC_PER_SEC diff --git a/src/libsystemd/sd-event/event-source.h b/src/libsystemd/sd-event/event-source.h index 6092652d0fa..f4e38d78d08 100644 --- a/src/libsystemd/sd-event/event-source.h +++ b/src/libsystemd/sd-event/event-source.h @@ -27,6 +27,7 @@ typedef enum EventSourceType { SOURCE_EXIT, SOURCE_WATCHDOG, SOURCE_INOTIFY, + SOURCE_MEMORY_PRESSURE, _SOURCE_EVENT_SOURCE_TYPE_MAX, _SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL, } EventSourceType; @@ -129,6 +130,17 @@ struct sd_event_source { struct inode_data *inode_data; LIST_FIELDS(sd_event_source, by_inode_data); } inotify; + struct { + int fd; + sd_event_handler_t callback; + void *write_buffer; + size_t write_buffer_size; + uint32_t events, revents; + LIST_FIELDS(sd_event_source, write_list); + bool registered:1; + bool locked:1; + bool in_write_list:1; + } memory_pressure; }; }; diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c index 307fdde311d..2f9b0ecda06 100644 --- a/src/libsystemd/sd-event/sd-event.c +++ b/src/libsystemd/sd-event/sd-event.c @@ -7,6 +7,7 @@ #include "sd-daemon.h" #include "sd-event.h" #include "sd-id128.h" +#include "sd-messages.h" #include "alloc-util.h" #include "env-util.h" @@ -15,15 +16,22 @@ #include "fs-util.h" #include "glyph-util.h" #include "hashmap.h" +#include "hexdecoct.h" #include "list.h" #include "logarithm.h" #include "macro.h" +#include "mallinfo-util.h" #include "memory-util.h" +#include "missing_magic.h" #include "missing_syscall.h" +#include "path-util.h" #include "prioq.h" #include "process-util.h" +#include "psi-util.h" #include "set.h" #include "signal-util.h" +#include "socket-util.h" +#include "stat-util.h" #include "string-table.h" #include "string-util.h" 
#include "strxcpyx.h" @@ -63,6 +71,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] [SOURCE_EXIT] = "exit", [SOURCE_WATCHDOG] = "watchdog", [SOURCE_INOTIFY] = "inotify", + [SOURCE_MEMORY_PRESSURE] = "memory-pressure", }; DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); @@ -85,7 +94,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int); SOURCE_TIME_BOOTTIME_ALARM, \ SOURCE_SIGNAL, \ SOURCE_DEFER, \ - SOURCE_INOTIFY) + SOURCE_INOTIFY, \ + SOURCE_MEMORY_PRESSURE) /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put(). * Time sources and ratelimited sources can be passed, so effectively this is the same as the @@ -130,6 +140,9 @@ struct sd_event { /* A list of inotify objects that already have events buffered which aren't processed yet */ LIST_HEAD(struct inotify_data, buffered_inotify_data_list); + /* A list of memory pressure event sources that still need their subscription string written */ + LIST_HEAD(sd_event_source, memory_pressure_write_list); + pid_t original_pid; uint64_t iteration; @@ -524,6 +537,65 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) { return 0; } +static void source_memory_pressure_unregister(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (event_pid_changed(s->event)) + return; + + if (!s->memory_pressure.registered) + return; + + if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL) < 0) + log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m", + strna(s->description), event_source_type_to_string(s->type)); + + s->memory_pressure.registered = false; +} + +static int source_memory_pressure_register(sd_event_source *s, int enabled) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(enabled != SD_EVENT_OFF); + + struct epoll_event ev = { + .events = 
s->memory_pressure.write_buffer_size > 0 ? EPOLLOUT : + (s->memory_pressure.events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0)), + .data.ptr = s, + }; + + if (epoll_ctl(s->event->epoll_fd, + s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD, + s->memory_pressure.fd, &ev) < 0) + return -errno; + + s->memory_pressure.registered = true; + return 0; +} + +static void source_memory_pressure_add_to_write_list(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (s->memory_pressure.in_write_list) + return; + + LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s); + s->memory_pressure.in_write_list = true; +} + +static void source_memory_pressure_remove_from_write_list(sd_event_source *s) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (!s->memory_pressure.in_write_list) + return; + + LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s); + s->memory_pressure.in_write_list = false; +} + static clockid_t event_source_type_to_clock(EventSourceType t) { switch (t) { @@ -947,6 +1019,11 @@ static void source_disconnect(sd_event_source *s) { break; } + case SOURCE_MEMORY_PRESSURE: + source_memory_pressure_remove_from_write_list(s); + source_memory_pressure_unregister(s); + break; + default: assert_not_reached(); } @@ -1017,6 +1094,11 @@ static sd_event_source* source_free(sd_event_source *s) { s->child.pidfd = safe_close(s->child.pidfd); } + if (s->type == SOURCE_MEMORY_PRESSURE) { + s->memory_pressure.fd = safe_close(s->memory_pressure.fd); + s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + } + if (s->destroy_callback) s->destroy_callback(s->userdata); @@ -1092,6 +1174,7 @@ static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType t [SOURCE_POST] = endoffsetof_field(sd_event_source, post), [SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit), [SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, 
inotify), + [SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure), }; sd_event_source *s; @@ -1771,6 +1854,257 @@ _public_ int sd_event_add_exit( return 0; } +int sd_event_trim_memory(void) { + int r; + + /* A default implementation of a memory pressure callback. Simply releases our own allocation caches + * and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a + * NULL callback parameter. */ + + log_debug("Memory pressure event, trimming malloc() memory."); + +#if HAVE_GENERIC_MALLINFO + generic_mallinfo before_mallinfo = generic_mallinfo_get(); +#endif + + usec_t before_timestamp = now(CLOCK_MONOTONIC); + hashmap_trim_pools(); + r = malloc_trim(0); + usec_t after_timestamp = now(CLOCK_MONOTONIC); + + if (r > 0) + log_debug("Successfully trimmed some memory."); + else + log_debug("Couldn't trim any memory."); + + usec_t period = after_timestamp - before_timestamp; + +#if HAVE_GENERIC_MALLINFO + generic_mallinfo after_mallinfo = generic_mallinfo_get(); + size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) + + LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena); + log_struct(LOG_DEBUG, + LOG_MESSAGE("Memory trimming took %s, returned %s to OS.", + FORMAT_TIMESPAN(period, 0), + FORMAT_BYTES(l)), + "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR, + "TRIMMED_BYTES=%zu", l, + "TRIMMED_USEC=" USEC_FMT, period); +#else + log_struct(LOG_DEBUG, + LOG_MESSAGE("Memory trimming took %s.", + FORMAT_TIMESPAN(period, 0)), + "MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR, + "TRIMMED_USEC=" USEC_FMT, period); +#endif + + return 0; +} + +static int memory_pressure_callback(sd_event_source *s, void *userdata) { + assert(s); + + sd_event_trim_memory(); + return 0; +} + +_public_ int sd_event_add_memory_pressure( + sd_event *e, + sd_event_source **ret, + sd_event_handler_t callback, + void *userdata) { + + _cleanup_free_ char *w = NULL; + _cleanup_(source_freep) sd_event_source *s = 
NULL; + _cleanup_close_ int path_fd = -1, fd = -1; + _cleanup_free_ void *write_buffer = NULL; + const char *watch, *watch_fallback = NULL, *env; /* watch_fallback stays NULL unless we pick the cgroup-local PSI file ourselves */ + size_t write_buffer_size = 0; + struct stat st; + uint32_t events; + bool locked; + int r; + + assert_return(e, -EINVAL); + assert_return(e = event_resolve(e), -ENOPKG); + assert_return(e->state != SD_EVENT_FINISHED, -ESTALE); + assert_return(!event_pid_changed(e), -ECHILD); + + if (!callback) + callback = memory_pressure_callback; + + s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE); + if (!s) + return -ENOMEM; + + s->wakeup = WAKEUP_EVENT_SOURCE; + s->memory_pressure.callback = callback; + s->userdata = userdata; + s->enabled = SD_EVENT_ON; + s->memory_pressure.fd = -EBADF; + + env = secure_getenv("MEMORY_PRESSURE_WATCH"); + if (env) { + if (isempty(env) || path_equal(env, "/dev/null")) + return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), + "Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH."); + + if (!path_is_absolute(env) || !path_is_normalized(env)) + return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), + "$MEMORY_PRESSURE_WATCH set to invalid path: %s", env); + + watch = env; + + env = secure_getenv("MEMORY_PRESSURE_WRITE"); + if (env) { + r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size); + if (r < 0) + return r; + } + + locked = true; + } else { + + r = is_pressure_supported(); + if (r < 0) + return r; + if (r == 0) + return -EOPNOTSUPP; + + /* By default we want to watch memory pressure on the local cgroup, but we'll fall back on + * the system wide pressure if for some reason we cannot (which could be: memory controller + * not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll + * only use the system-wide logic.
*/ + r = cg_all_unified(); + if (r < 0) + return r; + if (r == 0) + watch = "/proc/pressure/memory"; + else { + _cleanup_free_ char *cg = NULL; + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg); + if (r < 0) + return r; + + w = path_join("/sys/fs/cgroup", cg, "memory.pressure"); + if (!w) + return -ENOMEM; + + watch = w; + watch_fallback = "/proc/pressure/memory"; + } + + /* Android uses three levels in its userspace low memory killer logic: + * some 70000 1000000 + * some 100000 1000000 + * full 70000 1000000 + * + * GNOME's low memory monitor uses: + * some 70000 1000000 + * some 100000 1000000 + * full 100000 1000000 + * + * We'll default to the middle level that both agree on */ + if (asprintf((char**) &write_buffer, + "%s " USEC_FMT " " USEC_FMT, + MEMORY_PRESSURE_DEFAULT_TYPE, + MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC, + MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0) + return -ENOMEM; + + write_buffer_size = strlen(write_buffer) + 1; + locked = false; + } + + path_fd = open(watch, O_PATH|O_CLOEXEC); + if (path_fd < 0) { + if (errno != ENOENT) + return -errno; + + /* We got ENOENT. Three options now: try the fallback if we have one, or return the error as + * is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and + * the PSI service apparently is not supported) */ + if (!watch_fallback) + return locked ? -ENOENT : -EOPNOTSUPP; + + path_fd = open(watch_fallback, O_PATH|O_CLOEXEC); + if (path_fd < 0 && errno == ENOENT) /* PSI is not available in the kernel even under the fallback path?
*/ + return -EOPNOTSUPP; + if (path_fd < 0) /* errno is only meaningful if open() actually failed; propagate any other error as is */ + return -errno; + } + + if (fstat(path_fd, &st) < 0) + return -errno; + + if (S_ISSOCK(st.st_mode)) { + fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0); + if (fd < 0) + return -errno; + + r = connect_unix_path(fd, path_fd, NULL); + if (r < 0) + return r; + + events = EPOLLIN; + + } else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) { + fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY); + if (fd < 0) + return fd; + + if (S_ISREG(st.st_mode)) { + struct statfs sfs; + + /* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */ + + if (fstatfs(fd, &sfs) < 0) + return -errno; + + if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) && + !is_fs_type(&sfs, CGROUP2_SUPER_MAGIC)) + return -ENOTTY; + + events = EPOLLPRI; + } else + /* For fifos and char devices just watch for EPOLLIN */ + events = EPOLLIN; + + } else if (S_ISDIR(st.st_mode)) + return -EISDIR; + else + return -EBADF; + + s->memory_pressure.fd = TAKE_FD(fd); + s->memory_pressure.write_buffer = TAKE_PTR(write_buffer); + s->memory_pressure.write_buffer_size = write_buffer_size; + s->memory_pressure.events = events; + s->memory_pressure.locked = locked; + + /* So here's the thing: if we are talking to PSI we need to write the watch string before adding the + * fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the + * fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure + * event sources on which writes must be executed before the first event loop iteration is + * executed. (We could also write the data here, right away, but we want to give the caller the + * freedom to call sd_event_source_set_memory_pressure_type() and + * sd_event_source_set_memory_pressure_period() before we write it.)
*/ + + if (s->memory_pressure.write_buffer_size > 0) + source_memory_pressure_add_to_write_list(s); + else { + r = source_memory_pressure_register(s, s->enabled); + if (r < 0) + return r; + } + + if (ret) + *ret = s; + TAKE_PTR(s); + + return 0; +} + static void event_free_inotify_data(sd_event *e, struct inotify_data *d) { assert(e); @@ -2562,6 +2896,10 @@ static int event_source_offline( prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index); break; + case SOURCE_MEMORY_PRESSURE: + source_memory_pressure_unregister(s); + break; + case SOURCE_TIME_REALTIME: case SOURCE_TIME_BOOTTIME: case SOURCE_TIME_MONOTONIC: @@ -2649,6 +2987,13 @@ static int event_source_online( s->event->n_online_child_sources++; break; + case SOURCE_MEMORY_PRESSURE: + r = source_memory_pressure_register(s, enabled); + if (r < 0) + return r; + + break; + case SOURCE_TIME_REALTIME: case SOURCE_TIME_BOOTTIME: case SOURCE_TIME_MONOTONIC: @@ -3630,6 +3975,106 @@ static int process_inotify(sd_event *e) { return done; } +static int process_memory_pressure(sd_event_source *s, uint32_t revents) { + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + if (s->pending) + s->memory_pressure.revents |= revents; + else + s->memory_pressure.revents = revents; + + return source_set_pending(s, true); +} + +static int source_memory_pressure_write(sd_event_source *s) { + ssize_t n; + int r; + + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + /* once we start writing, the buffer is locked, we allow no further changes. 
*/ + s->memory_pressure.locked = true; + + if (s->memory_pressure.write_buffer_size > 0) { + n = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size); + if (n < 0) { + if (!ERRNO_IS_TRANSIENT(errno)) + return -errno; + + n = 0; + } + } else + n = 0; + + assert(n >= 0); + + if ((size_t) n == s->memory_pressure.write_buffer_size) { + s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer); + + if (n > 0) { + s->memory_pressure.write_buffer_size = 0; + + /* Update epoll events mask, since we have now written everything and don't care for EPOLLOUT anymore */ + r = source_memory_pressure_register(s, s->enabled); + if (r < 0) + return r; + } + } else if (n > 0) { + _cleanup_free_ void *c = NULL; + + assert((size_t) n < s->memory_pressure.write_buffer_size); + + c = memdup((uint8_t*) s->memory_pressure.write_buffer + n, s->memory_pressure.write_buffer_size - n); + if (!c) + return -ENOMEM; + + free_and_replace(s->memory_pressure.write_buffer, c); + s->memory_pressure.write_buffer_size -= n; + return 1; + } + + return 0; +} + +static int source_memory_pressure_initiate_dispatch(sd_event_source *s) { + int r; + + assert(s); + assert(s->type == SOURCE_MEMORY_PRESSURE); + + r = source_memory_pressure_write(s); + if (r < 0) + return r; + if (r > 0) + return 1; /* if we wrote something, then don't continue with dispatching user dispatch + * function. Instead, shortcut it so that we wait for next EPOLLOUT immediately. */ + + /* No pending incoming IO? 
Then let's not continue further */ + if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) == 0) { + + /* Treat IO errors on the notifier the same ways errors returned from a callback */ + if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0) + return -EIO; + + return 1; /* leave dispatch, we already processed everything */ + } + + if (s->memory_pressure.revents & EPOLLIN) { + uint8_t pipe_buf[PIPE_BUF]; + ssize_t n; + + /* If the fd is readable, then flush out anything that might be queued */ + + n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf)); + if (n < 0 && !ERRNO_IS_TRANSIENT(errno)) + return -errno; + } + + return 0; /* go on, dispatch to user callback */ +} + static int source_dispatch(sd_event_source *s) { EventSourceType saved_type; sd_event *saved_event; @@ -3678,6 +4123,16 @@ static int source_dispatch(sd_event_source *s) { } } + if (s->type == SOURCE_MEMORY_PRESSURE) { + r = source_memory_pressure_initiate_dispatch(s); + if (r == -EIO) /* handle EIO errors similar to callback errors */ + goto finish; + if (r < 0) + return r; + if (r > 0) /* already handled */ + return 1; + } + if (s->enabled == SD_EVENT_ONESHOT) { r = sd_event_source_set_enabled(s, SD_EVENT_OFF); if (r < 0) @@ -3764,6 +4219,10 @@ static int source_dispatch(sd_event_source *s) { break; } + case SOURCE_MEMORY_PRESSURE: + r = s->memory_pressure.callback(s, s->userdata); + break; + case SOURCE_WATCHDOG: case _SOURCE_EVENT_SOURCE_TYPE_MAX: case _SOURCE_EVENT_SOURCE_TYPE_INVALID: @@ -3772,6 +4231,7 @@ static int source_dispatch(sd_event_source *s) { s->dispatching = false; +finish: if (r < 0) { log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m", strna(s->description), @@ -3922,6 +4382,30 @@ static void event_close_inode_data_fds(sd_event *e) { } } +static int event_memory_pressure_write_list(sd_event *e) { + int r; + + assert(e); + + for (;;) { + sd_event_source *s; + + s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list); + 
if (!s) + break; + + assert(s->type == SOURCE_MEMORY_PRESSURE); + assert(s->memory_pressure.write_buffer_size > 0); + s->memory_pressure.in_write_list = false; + + r = source_memory_pressure_write(s); + if (r < 0) + return r; + } + + return 0; +} + _public_ int sd_event_prepare(sd_event *e) { int r; @@ -3950,6 +4434,10 @@ _public_ int sd_event_prepare(sd_event *e) { if (r < 0) return r; + r = event_memory_pressure_write_list(e); + if (r < 0) + return r; + r = event_arm_timer(e, &e->realtime); if (r < 0) return r; @@ -4115,6 +4603,10 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t r = process_pidfd(e, s, e->event_queue[i].events); break; + case SOURCE_MEMORY_PRESSURE: + r = process_memory_pressure(s, e->event_queue[i].events); + break; + default: assert_not_reached(); } @@ -4700,3 +5192,91 @@ _public_ int sd_event_set_signal_exit(sd_event *e, int b) { return change; } + +_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) { + _cleanup_free_ char *b = NULL; + _cleanup_free_ void *w = NULL; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + assert_return(ty, -EINVAL); + + if (!STR_IN_SET(ty, "some", "full")) + return -EINVAL; + + if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + return -EBUSY; + + char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + if (!space) + return -EINVAL; + + size_t l = (char*) space - (char*) s->memory_pressure.write_buffer; + b = memdup_suffix0(s->memory_pressure.write_buffer, l); + if (!b) + return -ENOMEM; + if (!STR_IN_SET(b, "some", "full")) + return -EINVAL; + + if (streq(b, ty)) + return 0; + + size_t nl = strlen(ty) + (s->memory_pressure.write_buffer_size - l); + w = new(char, nl); + if (!w) + return -ENOMEM; + + memcpy(stpcpy(w, ty), space, (s->memory_pressure.write_buffer_size - l)); + + 
free_and_replace(s->memory_pressure.write_buffer, w); + s->memory_pressure.write_buffer_size = nl; + s->memory_pressure.locked = false; + + return 1; +} + +_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) { + _cleanup_free_ char *b = NULL; + _cleanup_free_ void *w = NULL; + + assert_return(s, -EINVAL); + assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM); + + if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX) + return -ERANGE; + if (window_usec <= 0 || window_usec >= UINT64_MAX) + return -ERANGE; + if (threshold_usec > window_usec) + return -EINVAL; + + if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */ + return -EBUSY; + + char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size); + if (!space) + return -EINVAL; + + size_t l = (char*) space - (char*) s->memory_pressure.write_buffer; + b = memdup_suffix0(s->memory_pressure.write_buffer, l); + if (!b) + return -ENOMEM; + if (!STR_IN_SET(b, "some", "full")) + return -EINVAL; + + if (asprintf((char**) &w, + "%s " USEC_FMT " " USEC_FMT "", + b, + threshold_usec, + window_usec) < 0) + return -ENOMEM; /* the arguments were validated above, so a failure here is an allocation failure */ + + l = strlen(w) + 1; + if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0) + return 0; /* subscription string unchanged */ + + free_and_replace(s->memory_pressure.write_buffer, w); + s->memory_pressure.write_buffer_size = l; + s->memory_pressure.locked = false; + + return 1; +} diff --git a/src/systemd/sd-event.h b/src/systemd/sd-event.h index cae4c8672a7..5ca8528895e 100644 --- a/src/systemd/sd-event.h +++ b/src/systemd/sd-event.h @@ -99,6 +99,7 @@ int sd_event_add_inotify_fd(sd_event *e, sd_event_source **s, int fd, uint32_t m int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); int
sd_event_add_exit(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); +int sd_event_add_memory_pressure(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata); int sd_event_prepare(sd_event *e); int sd_event_wait(sd_event *e, uint64_t usec); @@ -160,6 +161,8 @@ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags); #endif int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret); +int sd_event_source_set_memory_pressure_type(sd_event_source *e, const char *ty); +int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec); int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback); int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret); int sd_event_source_get_floating(sd_event_source *s); @@ -171,6 +174,8 @@ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_use int sd_event_source_is_ratelimited(sd_event_source *s); int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback); +int sd_event_trim_memory(void); + /* Define helpers so that __attribute__((cleanup(sd_event_unrefp))) and similar may be used. 
*/ _SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event, sd_event_unref); _SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event_source, sd_event_source_unref); diff --git a/src/systemd/sd-messages.h b/src/systemd/sd-messages.h index 00fdbad2c56..39f56240fe9 100644 --- a/src/systemd/sd-messages.h +++ b/src/systemd/sd-messages.h @@ -195,6 +195,9 @@ _SD_BEGIN_DECLARATIONS; #define SD_MESSAGE_TPM_PCR_EXTEND SD_ID128_MAKE(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab) #define SD_MESSAGE_TPM_PCR_EXTEND_STR SD_ID128_MAKE_STR(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab) +#define SD_MESSAGE_MEMORY_TRIM SD_ID128_MAKE(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21) +#define SD_MESSAGE_MEMORY_TRIM_STR SD_ID128_MAKE_STR(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21) + _SD_END_DECLARATIONS; #endif From b7dc40e66317b0d389537d7e3d1cde7d84cac01a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 10 Feb 2023 13:29:58 +0100 Subject: [PATCH 2/3] test: add test for new memory pressure logic it tests both real PSI stuff (if available) and fake pressure via AF_UNIX and FIFO notification. 
--- src/test/meson.build | 4 + src/test/test-mempress.c | 309 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 313 insertions(+) create mode 100644 src/test/test-mempress.c diff --git a/src/test/meson.build b/src/test/meson.build index e3c0e67dcfb..dc9e95a7bdd 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -386,6 +386,10 @@ tests += [ 'sources' : files('test-math-util.c'), 'dependencies' : libm, }, + { + 'sources' : files('test-mempress.c'), + 'dependencies' : threads, + }, { 'sources' : files('test-namespace.c'), 'dependencies' : [ diff --git a/src/test/test-mempress.c b/src/test/test-mempress.c new file mode 100644 index 00000000000..3371965ac33 --- /dev/null +++ b/src/test/test-mempress.c @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: LGPL-2.1-or-later */ + +#include +#include +#include +#include + +#include +#include + +#include "bus-locator.h" +#include "bus-wait-for-jobs.h" +#include "fd-util.h" +#include "path-util.h" +#include "process-util.h" +#include "random-util.h" +#include "rm-rf.h" +#include "signal-util.h" +#include "socket-util.h" +#include "tests.h" +#include "tmpfile-util.h" +#include "unit-def.h" + +struct fake_pressure_context { + int fifo_fd; + int socket_fd; +}; + +static void *fake_pressure_thread(void *p) { + _cleanup_free_ struct fake_pressure_context *c = ASSERT_PTR(p); + _cleanup_close_ int cfd = -1; + + usleep(150); + + assert_se(write(c->fifo_fd, &(const char) { 'x' }, 1) == 1); + + usleep(150); + + cfd = accept4(c->socket_fd, NULL, NULL, SOCK_CLOEXEC); + assert_se(cfd >= 0); + char buf[STRLEN("hello")+1] = {}; + assert_se(read(cfd, buf, sizeof(buf)-1) == sizeof(buf)-1); + assert_se(streq(buf, "hello")); + assert_se(write(cfd, &(const char) { 'z' }, 1) == 1); + + return 0; +} + +static int fake_pressure_callback(sd_event_source *s, void *userdata) { + int *value = userdata; + const char *d; + + assert_se(s); + assert_se(sd_event_source_get_description(s, &d) >= 0); + + *value *= d[0]; + + log_notice("memory 
pressure event: %s", d); + + if (*value == 7 * 'f' * 's') + assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0); + + return 0; +} + +TEST(fake_pressure) { + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *ef = NULL; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *j = NULL, *k = NULL; + _cleanup_(rm_rf_physical_and_freep) char *tmp = NULL; + _cleanup_close_ int fifo_fd = -1, socket_fd = -1; + union sockaddr_union sa; + pthread_t th; + int value = 7; + + assert_se(sd_event_default(&e) >= 0); + + assert_se(mkdtemp_malloc(NULL, &tmp) >= 0); + + assert_se(j = path_join(tmp, "fifo")); + assert_se(mkfifo(j, 0600) >= 0); + fifo_fd = open(j, O_CLOEXEC|O_RDWR|O_NONBLOCK); + assert_se(fifo_fd >= 0); + + assert_se(k = path_join(tmp, "sock")); + socket_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0); + assert_se(socket_fd >= 0); + assert_se(sockaddr_un_set_path(&sa.un, k) >= 0); + assert_se(bind(socket_fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) >= 0); + assert_se(listen(socket_fd, 1) >= 0); + + /* Ideally we'd just allocate this on the stack, but AddressSanitizer doesn't like it if threads + * access each other's stack */ + struct fake_pressure_context *fp = new(struct fake_pressure_context, 1); + assert_se(fp); + *fp = (struct fake_pressure_context) { + .fifo_fd = fifo_fd, + .socket_fd = socket_fd, + }; + + assert_se(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)) == 0); + + assert_se(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true) >= 0); + assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + + assert_se(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value) >= 0); + assert_se(sd_event_source_set_description(es, "fifo event source") >= 0); + + assert_se(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true) >= 0); + assert_se(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true) >= 0); + + assert_se(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value) >= 0); + 
assert_se(sd_event_source_set_description(ef, "socket event source") >= 0); + + assert_se(sd_event_loop(e) >= 0); + + assert_se(value == 7 * 'f' * 's'); + + assert_se(pthread_join(th, NULL) == 0); +} + +struct real_pressure_context { + sd_event_source *pid; +}; + +static int real_pressure_callback(sd_event_source *s, void *userdata) { + struct real_pressure_context *c = ASSERT_PTR(userdata); + const char *d; + + assert_se(s); + assert_se(sd_event_source_get_description(s, &d) >= 0); + + log_notice("real_memory pressure event: %s", d); + + sd_event_trim_memory(); + + assert_se(c->pid); + assert_se(sd_event_source_send_child_signal(c->pid, SIGKILL, NULL, 0) >= 0); + c->pid = NULL; + + return 0; +} + +#define MMAP_SIZE (10 * 1024 * 1024) + +_noreturn_ static void real_pressure_eat_memory(int pipe_fd) { + size_t ate = 0; + + /* Allocates and touches 10M at a time, until runs out of memory */ + + char x; + assert_se(read(pipe_fd, &x, 1) == 1); /* Wait for the GO! */ + + for (;;) { + void *p; + + p = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + assert_se(p != MAP_FAILED); + + log_info("Eating another %s.", FORMAT_BYTES(MMAP_SIZE)); + + memset(p, random_u32() & 0xFF, MMAP_SIZE); + ate += MMAP_SIZE; + + log_info("Ate %s in total.", FORMAT_BYTES(ate)); + + usleep(50 * USEC_PER_MSEC); + } +} + +static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) { + assert_se(s); + assert_se(si); + + log_notice("child dead"); + + assert_se(si->si_signo == SIGCHLD); + assert_se(si->si_status == SIGKILL); + assert_se(si->si_code == CLD_KILLED); + + assert_se(sd_event_exit(sd_event_source_get_event(s), 31) >= 0); + return 0; +} + +TEST(real_pressure) { + _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; + _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; + _cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL; + _cleanup_(bus_wait_for_jobs_freep) 
BusWaitForJobs *w = NULL; + _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL; + _cleanup_close_pair_ int pipe_fd[2] = PIPE_EBADF; + _cleanup_(sd_event_unrefp) sd_event *e = NULL; + _cleanup_free_ char *scope = NULL; + const char *object; + int r; + pid_t pid; + + r = sd_bus_open_system(&bus); + if (r < 0) { + log_notice_errno(r, "Can't connect to system bus, skipping test: %m"); + return; + } + + assert_se(bus_wait_for_jobs_new(bus, &w) >= 0); + + assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit") >= 0); + assert_se(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64()) >= 0); + assert_se(sd_bus_message_append(m, "ss", scope, "fail") >= 0); + assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0) >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryAccounting", "b", true) >= 0); + assert_se(sd_bus_message_close_container(m) >= 0); + assert_se(sd_bus_message_append(m, "a(sa(sv))", 0) >= 0); + + r = sd_bus_call(bus, m, 0, &error, &reply); + if (r < 0) { + log_notice_errno(r, "Can't issue transient unit call, skipping test: %m"); + return; + } + + assert_se(sd_bus_message_read(reply, "o", &object) >= 0); + + assert_se(bus_wait_for_jobs_one(w, object, /* quiet= */ false, /* extra_args= */ NULL) >= 0); + + assert_se(sd_event_default(&e) >= 0); + + assert_se(pipe2(pipe_fd, O_CLOEXEC) >= 0); + + r = safe_fork("(eat-memory)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid); + assert_se(r >= 0); + if (r == 0) { + real_pressure_eat_memory(pipe_fd[0]); + _exit(EXIT_SUCCESS); + } + + assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0); + assert_se(sd_event_add_child(e, &cs, pid, WEXITED, real_pressure_child_callback, NULL) >= 0); + assert_se(sd_event_source_set_child_process_own(cs, true) >= 0); + + assert_se(unsetenv("MEMORY_PRESSURE_WATCH") >= 0); + assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0); + + struct real_pressure_context context = { + .pid 
= cs, + }; + + r = sd_event_add_memory_pressure(e, &es, real_pressure_callback, &context); + if (r < 0) { + log_notice_errno(r, "Can't allocate memory pressure fd, skipping test: %m"); + return; + } + + assert_se(sd_event_source_set_description(es, "real pressure event source") >= 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "full") > 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "full") == 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") > 0); + assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0); + assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) > 0); + assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) == 0); + assert_se(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT) >= 0); + + _cleanup_free_ char *uo; + assert_se(uo = unit_dbus_path_from_name(scope)); + + uint64_t mcurrent = UINT64_MAX; + assert_se(sd_bus_get_property_trivial(bus, "org.freedesktop.systemd1", uo, "org.freedesktop.systemd1.Scope", "MemoryCurrent", &error, 't', &mcurrent) >= 0); + + printf("current: %" PRIu64 "\n", mcurrent); + if (mcurrent == UINT64_MAX) { + log_notice_errno(r, "Memory accounting not available, skipping test: %m"); + return; + } + + m = sd_bus_message_unref(m); + + assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties") >= 0); + assert_se(sd_bus_message_append(m, "sb", scope, true) >= 0); + assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryHigh", "t", mcurrent + (15 * 1024 * 1024)) >= 0); + assert_se(sd_bus_message_append(m, "(sv)", "MemoryMax", "t", mcurrent + (50 * 1024 * 1024)) >= 0); + assert_se(sd_bus_message_close_container(m) >= 0); + + assert_se(sd_bus_call(bus, m, 0, NULL, NULL) >= 0); + + /* Generate some memory allocations via mempool */ +#define NN 
(1024) + Hashmap **h = new(Hashmap*, NN); + for (int i = 0; i < NN; i++) + h[i] = hashmap_new(NULL); + for (int i = 0; i < NN; i++) + hashmap_free(h[i]); + free(h); + + /* Now start eating memory */ + assert_se(write(pipe_fd[1], &(const char) { 'x' }, 1) == 1); + + assert_se(sd_event_loop(e) >= 0); + int ex = 0; + assert_se(sd_event_get_exit_code(e, &ex) >= 0); + assert_se(ex == 31); +} + +static int outro(void) { + hashmap_trim_pools(); + return 0; +} + +DEFINE_TEST_MAIN_FULL(LOG_DEBUG, NULL, outro); From f8a32e679eec7173db5f7ccfb63a2e3841ded1e1 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 16 Feb 2023 17:24:28 +0100 Subject: [PATCH 3/3] man: document the new sd_event_add_memory_pressure() API --- man/rules/meson.build | 6 + man/sd_event_add_memory_pressure.xml | 270 +++++++++++++++++++++++++++ 2 files changed, 276 insertions(+) create mode 100644 man/sd_event_add_memory_pressure.xml diff --git a/man/rules/meson.build b/man/rules/meson.build index dcae4442eaa..1279004161b 100644 --- a/man/rules/meson.build +++ b/man/rules/meson.build @@ -555,6 +555,12 @@ manpages = [ 'sd_event_source_set_io_fd', 'sd_event_source_set_io_fd_own'], ''], + ['sd_event_add_memory_pressure', + '3', + ['sd_event_source_set_memory_pressure_period', + 'sd_event_source_set_memory_pressure_type', + 'sd_event_trim_memory'], + ''], ['sd_event_add_signal', '3', ['SD_EVENT_SIGNAL_PROCMASK', diff --git a/man/sd_event_add_memory_pressure.xml b/man/sd_event_add_memory_pressure.xml new file mode 100644 index 00000000000..46cae0d16db --- /dev/null +++ b/man/sd_event_add_memory_pressure.xml @@ -0,0 +1,270 @@ + + + + + + + + sd_event_add_memory_pressure + systemd + + + + sd_event_add_memory_pressure + 3 + + + + sd_event_add_memory_pressure + sd_event_source_set_memory_pressure_type + sd_event_source_set_memory_pressure_period + sd_event_trim_memory + + Add and configure an event source run as result of memory pressure + + + + + #include <systemd/sd-event.h> + + typedef struct 
sd_event_source sd_event_source; + + + int sd_event_add_memory_pressure + sd_event *event + sd_event_source **ret_source + sd_event_handler_t handler + void *userdata + + + + int sd_event_source_set_memory_pressure_type + sd_event_source *source + const char *type + + + + int sd_event_source_set_memory_pressure_period + sd_event_source *source + uint64_t threshold_usec + uint64_t window_usec + + + + int sd_event_trim_memory + void + + + + + + Description + + sd_event_add_memory_pressure() adds a new event source that is triggered + whenever memory pressure is seen. This functionality is built around the Linux kernel's Pressure Stall Information (PSI) logic. + + Expects an event loop object as first parameter, and returns the allocated event source object in + the second parameter, on success. The handler parameter is a function to call when + memory pressure is seen, or NULL. The handler function will be passed the + userdata pointer, which may be chosen freely by the caller. The handler may return + negative to signal an error (see below), other return values are ignored. If + handler is NULL, a default handler that compacts allocation + caches maintained by libsystemd as well as glibc (via malloc_trim3) + will be used. + + To destroy an event source object use + sd_event_source_unref3, + but note that the event source is only removed from the event loop when all references to the event + source are dropped. To make sure an event source does not fire anymore, even if it is still referenced, + disable the event source using + sd_event_source_set_enabled3 + with SD_EVENT_OFF. + + If the second parameter of sd_event_add_memory_pressure() is + NULL no reference to the event source object is returned. In this case the event + source is considered "floating", and will be destroyed implicitly when the event loop itself is + destroyed. 
+ + The event source will fire according to the following logic: + + + If the + $MEMORY_PRESSURE_WATCH/$MEMORY_PRESSURE_WRITE environment + variables are set at the time the event source is established, it will watch the file, FIFO or AF_UNIX + socket specified via $MEMORY_PRESSURE_WATCH (which must contain an absolute path + name) for POLLPRI (in case it is a regular file) or POLLIN + events (otherwise). After opening the inode, it will write the (decoded) Base64 data provided via + $MEMORY_PRESSURE_WRITE into it before it starts polling on it (the variable may be + unset in which case this is skipped). Typically, if used, $MEMORY_PRESSURE_WATCH + will contain a path such as /proc/pressure/memory or a path to a specific + memory.pressure file in the control group file system + (cgroupfs). + + If these environment variables are not set, the local PSI interface file + memory.pressure of the control group the invoking process is running in is + used. + + If that file does not exist, the system-wide PSI interface file + /proc/pressure/memory is watched instead. + + + Or in other words: preferably any explicit configuration passed in by an invoking service manager + (or similar) is used as notification source, before falling back to local notifications of the service, + and finally to global notifications of the system. + + Well-behaving services and applications are recommended to react to memory pressure events by + executing one or more of the following operations, in order to ensure optimal behaviour even on loaded + and resource-constrained systems: + + + Release allocation caches such as malloc_trim() or similar, both + implemented in the libraries consumed by the program and in private allocation caches of the program + itself. + + Release any other form of in-memory caches that can easily be recovered if + needed (e.g. browser caches). + + Terminate idle worker threads or processes, or similar. 
+ + Even exit entirely from the program if it is idle and can be automatically started when + needed (for example via socket or bus activation). + + + Any of the suggested operations should help ease memory pressure situations and allow the + system to make progress by reclaiming the memory for other purposes. + + This event source typically fires on memory pressure stalls, i.e. when operational latency above a + configured threshold already has been seen. This should be taken into consideration when discussing + whether later latency to re-acquire any released resources is acceptable: it's usually more important to + think of the latencies that already happened than those coming up in future. + + The sd_event_source_set_memory_pressure_type() and + sd_event_source_set_memory_pressure_period() functions can be used to fine-tune the + PSI parameters for pressure notifications. The former takes either some or + full as second parameter, the latter takes threshold and window times in microseconds + as parameters. For details about these three parameters see the PSI documentation. Note that these two + calls must be invoked immediately after allocating the event source, as they must be configured before + polling begins. Also note that these calls will fail if memory pressure parameterization has been passed + in via the $MEMORY_PRESSURE_WATCH/$MEMORY_PRESSURE_WRITE + environment variables (or in other words: configuration supplied by a service manager wins over internal + settings). + + The sd_event_trim_memory() function releases various internal allocation + caches maintained by libsystemd and then invokes glibc's malloc_trim3. This + makes the operation executed when the handler function parameter of + sd_event_add_memory_pressure is passed as NULL directly + accessible for invocation at any time (see above).
This function will log a structured log message at + LOG_DEBUG level (with message ID f9b0be465ad540d0850ad32172d57c21) about the memory + pressure operation. + + + + Return Value + + On success, these functions return 0 or a positive + integer. On failure, they return a negative errno-style error + code. + + + Errors + + Returned errors may indicate the following problems: + + + + -ENOMEM + + Not enough memory to allocate an object. + + + + -EINVAL + + An invalid argument has been passed. + + + + -EHOSTDOWN + + The $MEMORY_PRESSURE_WATCH variable has been set to the literal + string /dev/null, in order to explicitly disable memory pressure + handling. + + + + -EBADMSG + + The $MEMORY_PRESSURE_WATCH variable has been set to an invalid + string, for example a relative rather than an absolute path. + + + + -ENOTTY + + The $MEMORY_PRESSURE_WATCH variable points to a regular file + outside of the procfs or cgroupfs file systems. + + + + -EOPNOTSUPP + + No configuration via $MEMORY_PRESSURE_WATCH has been specified + and the local kernel does not support the PSI interface. + + + + -EBUSY + + This is returned by sd_event_source_set_memory_pressure_type() + and sd_event_source_set_memory_pressure_period() if invoked on event sources + at a time later than immediately after allocating them. + + + + -ESTALE + + The event loop is already terminated. + + + + -ECHILD + + The event loop has been created in a different process. + + + + -EDOM + + The passed event source is not a memory pressure event source. + + + + + + + + + + See Also + + + systemd1, + sd-event3, + sd_event_new3, + sd_event_add_io3, + sd_event_add_time3, + sd_event_add_child3, + sd_event_add_inotify3, + sd_event_add_defer3, + sd_event_source_set_enabled3, + sd_event_source_set_description3, + sd_event_source_set_userdata3, + sd_event_source_set_floating3 + + + +