mirror of
https://github.com/systemd/systemd.git
synced 2024-12-27 07:22:31 +03:00
Merge pull request #26448 from poettering/sd-event-mempress
sd-event: add memory pressure event source
This commit is contained in:
commit
921330af40
@ -549,3 +549,18 @@ Whenever the system transitions to a new runtime phase, the specified PCR is
|
||||
extended with a different string, to ensure that security policies for
|
||||
TPM-bound secrets and other resources are limited to specific phases of the
|
||||
runtime.
|
||||
|
||||
-- f9b0be465ad540d0850ad32172d57c21
|
||||
Subject: Memory Trimmed
|
||||
Defined-By: systemd
|
||||
Support: %SUPPORT_URL%
|
||||
|
||||
Memory of process @_PID@ (@_COMM@) has been trimmed.
|
||||
|
||||
Either on user request or as result of a memory pressure event, memory of the
|
||||
process has been trimmed, returning unneeded allocation caches and other
|
||||
resources back to the OS kernel, making them available for other components of
|
||||
the OS.
|
||||
|
||||
@TRIMMED_BYTES@ of memory were returned to the OS, which took @TRIMMED_USEC@
|
||||
micro-seconds (µs).
|
||||
|
@ -555,6 +555,12 @@ manpages = [
|
||||
'sd_event_source_set_io_fd',
|
||||
'sd_event_source_set_io_fd_own'],
|
||||
''],
|
||||
['sd_event_add_memory_pressure',
|
||||
'3',
|
||||
['sd_event_source_set_memory_pressure_period',
|
||||
'sd_event_source_set_memory_pressure_type',
|
||||
'sd_event_trim_memory'],
|
||||
''],
|
||||
['sd_event_add_signal',
|
||||
'3',
|
||||
['SD_EVENT_SIGNAL_PROCMASK',
|
||||
|
270
man/sd_event_add_memory_pressure.xml
Normal file
270
man/sd_event_add_memory_pressure.xml
Normal file
@ -0,0 +1,270 @@
|
||||
<?xml version='1.0'?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
|
||||
"http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
|
||||
<!-- SPDX-License-Identifier: LGPL-2.1-or-later -->
|
||||
|
||||
<refentry id="sd_event_add_memory_pressure" xmlns:xi="http://www.w3.org/2001/XInclude">
|
||||
|
||||
<refentryinfo>
|
||||
<title>sd_event_add_memory_pressure</title>
|
||||
<productname>systemd</productname>
|
||||
</refentryinfo>
|
||||
|
||||
<refmeta>
|
||||
<refentrytitle>sd_event_add_memory_pressure</refentrytitle>
|
||||
<manvolnum>3</manvolnum>
|
||||
</refmeta>
|
||||
|
||||
<refnamediv>
|
||||
<refname>sd_event_add_memory_pressure</refname>
|
||||
<refname>sd_event_source_set_memory_pressure_type</refname>
|
||||
<refname>sd_event_source_set_memory_pressure_period</refname>
|
||||
<refname>sd_event_trim_memory</refname>
|
||||
|
||||
<refpurpose>Add and configure an event source run as result of memory pressure</refpurpose>
|
||||
</refnamediv>
|
||||
|
||||
<refsynopsisdiv>
|
||||
<funcsynopsis>
|
||||
<funcsynopsisinfo>#include &lt;systemd/sd-event.h&gt;</funcsynopsisinfo>
|
||||
|
||||
<funcsynopsisinfo><token>typedef</token> struct sd_event_source sd_event_source;</funcsynopsisinfo>
|
||||
|
||||
<funcprototype>
|
||||
<funcdef>int <function>sd_event_add_memory_pressure</function></funcdef>
|
||||
<paramdef>sd_event *<parameter>event</parameter></paramdef>
|
||||
<paramdef>sd_event_source **<parameter>ret_source</parameter></paramdef>
|
||||
<paramdef>sd_event_handler_t <parameter>handler</parameter></paramdef>
|
||||
<paramdef>void *<parameter>userdata</parameter></paramdef>
|
||||
</funcprototype>
|
||||
|
||||
<funcprototype>
|
||||
<funcdef>int <function>sd_event_source_set_memory_pressure_type</function></funcdef>
|
||||
<paramdef>sd_event_source *<parameter>source</parameter></paramdef>
|
||||
<paramdef>const char *<parameter>type</parameter></paramdef>
|
||||
</funcprototype>
|
||||
|
||||
<funcprototype>
|
||||
<funcdef>int <function>sd_event_source_set_memory_pressure_period</function></funcdef>
|
||||
<paramdef>sd_event_source *<parameter>source</parameter></paramdef>
|
||||
<paramdef>uint64_t <parameter>threshold_usec</parameter></paramdef>
|
||||
<paramdef>uint64_t <parameter>window_usec</parameter></paramdef>
|
||||
</funcprototype>
|
||||
|
||||
<funcprototype>
|
||||
<funcdef>int <function>sd_event_trim_memory</function></funcdef>
|
||||
<paramdef>void</paramdef>
|
||||
</funcprototype>
|
||||
</funcsynopsis>
|
||||
</refsynopsisdiv>
|
||||
|
||||
<refsect1>
|
||||
<title>Description</title>
|
||||
|
||||
<para><function>sd_event_add_memory_pressure()</function> adds a new event source that is triggered
|
||||
whenever memory pressure is seen. This functionality is built around the Linux kernel's <ulink
|
||||
url="https://docs.kernel.org/accounting/psi.html">Pressure Stall Information (PSI)</ulink> logic.</para>
|
||||
|
||||
<para>Expects an event loop object as first parameter, and returns the allocated event source object in
|
||||
the second parameter, on success. The <parameter>handler</parameter> parameter is a function to call when
|
||||
memory pressure is seen, or <constant>NULL</constant>. The handler function will be passed the
|
||||
<parameter>userdata</parameter> pointer, which may be chosen freely by the caller. The handler may return
|
||||
negative to signal an error (see below), other return values are ignored. If
|
||||
<parameter>handler</parameter> is <constant>NULL</constant>, a default handler that compacts allocation
|
||||
caches maintained by <filename>libsystemd</filename> as well as glibc (via <citerefentry
|
||||
project='man-pages'><refentrytitle>malloc_trim</refentrytitle><manvolnum>3</manvolnum></citerefentry>)
|
||||
will be used.</para>
|
||||
|
||||
<para>To destroy an event source object use
|
||||
<citerefentry><refentrytitle>sd_event_source_unref</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
but note that the event source is only removed from the event loop when all references to the event
|
||||
source are dropped. To make sure an event source does not fire anymore, even if it is still referenced,
|
||||
disable the event source using
|
||||
<citerefentry><refentrytitle>sd_event_source_set_enabled</refentrytitle><manvolnum>3</manvolnum></citerefentry>
|
||||
with <constant>SD_EVENT_OFF</constant>.</para>
|
||||
|
||||
<para>If the second parameter of <function>sd_event_add_memory_pressure()</function> is
|
||||
<constant>NULL</constant> no reference to the event source object is returned. In this case the event
|
||||
source is considered "floating", and will be destroyed implicitly when the event loop itself is
|
||||
destroyed.</para>
|
||||
|
||||
<para>The event source will fire according to the following logic:</para>
|
||||
|
||||
<orderedlist>
|
||||
<listitem><para>If the
|
||||
<varname>$MEMORY_PRESSURE_WATCH</varname>/<varname>$MEMORY_PRESSURE_WRITE</varname> environment
|
||||
variables are set at the time the event source is established, it will watch the file, FIFO or AF_UNIX
|
||||
socket specified via <varname>$MEMORY_PRESSURE_WATCH</varname> (which must contain an absolute path
|
||||
name) for <constant>POLLPRI</constant> (in case it is a regular file) or <constant>POLLIN</constant>
|
||||
events (otherwise). After opening the inode, it will write the (decoded) Base64 data provided via
|
||||
<varname>$MEMORY_PRESSURE_WRITE</varname> into it before it starts polling on it (the variable may be
|
||||
unset in which case this is skipped). Typically, if used, <varname>$MEMORY_PRESSURE_WATCH</varname>
|
||||
will contain a path such as <filename>/proc/pressure/memory</filename> or a path to a specific
|
||||
<filename>memory.pressure</filename> file in the control group file system
|
||||
(cgroupfs).</para></listitem>
|
||||
|
||||
<listitem><para>If these environment variables are not set, the local PSI interface file
|
||||
<filename>memory.pressure</filename> of the control group the invoking process is running in is
|
||||
used.</para></listitem>
|
||||
|
||||
<listitem><para>If that file does not exist, the system-wide PSI interface file
|
||||
<filename>/proc/pressure/memory</filename> is watched instead.</para></listitem>
|
||||
</orderedlist>
|
||||
|
||||
<para>Or in other words: preferably any explicit configuration passed in by an invoking service manager
|
||||
(or similar) is used as notification source, before falling back to local notifications of the service,
|
||||
and finally to global notifications of the system.</para>
|
||||
|
||||
<para>Well-behaving services and applications are recommended to react to memory pressure events by
|
||||
executing one or more of the following operations, in order to ensure optimal behaviour even on loaded
|
||||
and resource-constrained systems:</para>
|
||||
|
||||
<itemizedlist>
|
||||
<listitem><para>Release allocation caches such as <function>malloc_trim()</function> or similar, both
|
||||
implemented in the libraries consumed by the program and in private allocation caches of the program
|
||||
itself.</para></listitem>
|
||||
|
||||
<listitem><para>Release any other form of in-memory caches that can easily be recovered if
|
||||
needed (e.g. browser caches).</para></listitem>
|
||||
|
||||
<listitem><para>Terminate idle worker threads or processes, or similar.</para></listitem>
|
||||
|
||||
<listitem><para>Even exit entirely from the program if it is idle and can be automatically started when
|
||||
needed (for example via socket or bus activation).</para></listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<para>Any of the suggested operations should help easing memory pressure situations and allowing the
|
||||
system to make progress by reclaiming the memory for other purposes.</para>
|
||||
|
||||
<para>This event source typically fires on memory pressure stalls, i.e. when operational latency above a
|
||||
configured threshold already has been seen. This should be taken into consideration when discussing
|
||||
whether later latency to re-acquire any released resources is acceptable: it's usually more important to
|
||||
think of the latencies that already happened than those coming up in future.</para>
|
||||
|
||||
<para>The <function>sd_event_source_set_memory_pressure_type()</function> and
|
||||
<function>sd_event_source_set_memory_pressure_period()</function> functions can be used to fine-tune the
|
||||
PSI parameters for pressure notifications. The former takes either <literal>some</literal> or
|
||||
<literal>full</literal> as second parameter, the latter takes threshold and period times in microseconds
|
||||
as parameters. For details about these three parameters see the PSI documentation. Note that these two
|
||||
calls must be invoked immediately after allocating the event source, as they must be configured before
|
||||
polling begins. Also note that these calls will fail if memory pressure parameterization has been passed
|
||||
in via the <varname>$MEMORY_PRESSURE_WATCH</varname>/<varname>$MEMORY_PRESSURE_WRITE</varname>
|
||||
environment variables (or in other words: configuration supplied by a service manager wins over internal
|
||||
settings).</para>
|
||||
|
||||
<para>The <function>sd_event_trim_memory()</function> function releases various internal allocation
|
||||
caches maintained by <filename>libsystemd</filename> and then invokes glibc's <citerefentry
|
||||
project='man-pages'><refentrytitle>malloc_trim</refentrytitle><manvolnum>3</manvolnum></citerefentry>. This
|
||||
makes the operation executed when the handler function parameter of
|
||||
<function>sd_event_add_memory_pressure()</function> is passed as <constant>NULL</constant> directly
|
||||
accessible for invocation at any time (see above). This function will log a structured log message at
|
||||
<constant>LOG_DEBUG</constant> level (with message ID f9b0be465ad540d0850ad32172d57c21) about the memory
|
||||
pressure operation.</para>
|
||||
</refsect1>
|
||||
|
||||
<refsect1>
|
||||
<title>Return Value</title>
|
||||
|
||||
<para>On success, these functions return 0 or a positive
|
||||
integer. On failure, they return a negative errno-style error
|
||||
code.</para>
|
||||
|
||||
<refsect2>
|
||||
<title>Errors</title>
|
||||
|
||||
<para>Returned errors may indicate the following problems:</para>
|
||||
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term><constant>-ENOMEM</constant></term>
|
||||
|
||||
<listitem><para>Not enough memory to allocate an object.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EINVAL</constant></term>
|
||||
|
||||
<listitem><para>An invalid argument has been passed.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EHOSTDOWN</constant></term>
|
||||
|
||||
<listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> variable has been set to the literal
|
||||
string <filename>/dev/null</filename>, in order to explicitly disable memory pressure
|
||||
handling.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EBADMSG</constant></term>
|
||||
|
||||
<listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> variable has been set to an invalid
|
||||
string, for example a relative rather than an absolute path.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-ENOTTY</constant></term>
|
||||
|
||||
<listitem><para>The <varname>$MEMORY_PRESSURE_WATCH</varname> variable points to a regular file
|
||||
outside of the procfs or cgroupfs file systems.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EOPNOTSUPP</constant></term>
|
||||
|
||||
<listitem><para>No configuration via <varname>$MEMORY_PRESSURE_WATCH</varname> has been specified
|
||||
and the local kernel does not support the PSI interface.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EBUSY</constant></term>
|
||||
|
||||
<listitem><para>This is returned by <function>sd_event_source_set_memory_pressure_type()</function>
|
||||
and <function>sd_event_source_set_memory_pressure_period()</function> if invoked on event sources
|
||||
at a time later than immediately after allocating them.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-ESTALE</constant></term>
|
||||
|
||||
<listitem><para>The event loop is already terminated.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-ECHILD</constant></term>
|
||||
|
||||
<listitem><para>The event loop has been created in a different process.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><constant>-EDOM</constant></term>
|
||||
|
||||
<listitem><para>The passed event source is not a memory pressure event source.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect2>
|
||||
</refsect1>
|
||||
|
||||
<xi:include href="libsystemd-pkgconfig.xml" />
|
||||
|
||||
<refsect1>
|
||||
<title>See Also</title>
|
||||
|
||||
<para>
|
||||
<citerefentry><refentrytitle>systemd</refentrytitle><manvolnum>1</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd-event</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_new</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_add_io</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_add_time</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_add_child</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_add_inotify</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_add_defer</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_source_set_enabled</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_source_set_description</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_source_set_userdata</refentrytitle><manvolnum>3</manvolnum></citerefentry>,
|
||||
<citerefentry><refentrytitle>sd_event_source_set_floating</refentrytitle><manvolnum>3</manvolnum></citerefentry>
|
||||
</para>
|
||||
</refsect1>
|
||||
|
||||
</refentry>
|
@ -28,3 +28,8 @@ int read_resource_pressure(const char *path, PressureType type, ResourcePressure
|
||||
|
||||
/* Was the kernel compiled with CONFIG_PSI=y? 1 if yes, 0 if not, negative on error. */
|
||||
int is_pressure_supported(void);
|
||||
|
||||
/* Default parameters for memory pressure watch logic in sd-event and PID 1 */
|
||||
#define MEMORY_PRESSURE_DEFAULT_TYPE "some"
|
||||
#define MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC (100 * USEC_PER_MSEC)
|
||||
#define MEMORY_PRESSURE_DEFAULT_WINDOW_USEC USEC_PER_SEC
|
||||
|
@ -27,6 +27,7 @@ typedef enum EventSourceType {
|
||||
SOURCE_EXIT,
|
||||
SOURCE_WATCHDOG,
|
||||
SOURCE_INOTIFY,
|
||||
SOURCE_MEMORY_PRESSURE,
|
||||
_SOURCE_EVENT_SOURCE_TYPE_MAX,
|
||||
_SOURCE_EVENT_SOURCE_TYPE_INVALID = -EINVAL,
|
||||
} EventSourceType;
|
||||
@ -129,6 +130,17 @@ struct sd_event_source {
|
||||
struct inode_data *inode_data;
|
||||
LIST_FIELDS(sd_event_source, by_inode_data);
|
||||
} inotify;
|
||||
struct {
|
||||
int fd;
|
||||
sd_event_handler_t callback;
|
||||
void *write_buffer;
|
||||
size_t write_buffer_size;
|
||||
uint32_t events, revents;
|
||||
LIST_FIELDS(sd_event_source, write_list);
|
||||
bool registered:1;
|
||||
bool locked:1;
|
||||
bool in_write_list:1;
|
||||
} memory_pressure;
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "sd-daemon.h"
|
||||
#include "sd-event.h"
|
||||
#include "sd-id128.h"
|
||||
#include "sd-messages.h"
|
||||
|
||||
#include "alloc-util.h"
|
||||
#include "env-util.h"
|
||||
@ -15,15 +16,22 @@
|
||||
#include "fs-util.h"
|
||||
#include "glyph-util.h"
|
||||
#include "hashmap.h"
|
||||
#include "hexdecoct.h"
|
||||
#include "list.h"
|
||||
#include "logarithm.h"
|
||||
#include "macro.h"
|
||||
#include "mallinfo-util.h"
|
||||
#include "memory-util.h"
|
||||
#include "missing_magic.h"
|
||||
#include "missing_syscall.h"
|
||||
#include "path-util.h"
|
||||
#include "prioq.h"
|
||||
#include "process-util.h"
|
||||
#include "psi-util.h"
|
||||
#include "set.h"
|
||||
#include "signal-util.h"
|
||||
#include "socket-util.h"
|
||||
#include "stat-util.h"
|
||||
#include "string-table.h"
|
||||
#include "string-util.h"
|
||||
#include "strxcpyx.h"
|
||||
@ -63,6 +71,7 @@ static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX]
|
||||
[SOURCE_EXIT] = "exit",
|
||||
[SOURCE_WATCHDOG] = "watchdog",
|
||||
[SOURCE_INOTIFY] = "inotify",
|
||||
[SOURCE_MEMORY_PRESSURE] = "memory-pressure",
|
||||
};
|
||||
|
||||
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
|
||||
@ -85,7 +94,8 @@ DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
|
||||
SOURCE_TIME_BOOTTIME_ALARM, \
|
||||
SOURCE_SIGNAL, \
|
||||
SOURCE_DEFER, \
|
||||
SOURCE_INOTIFY)
|
||||
SOURCE_INOTIFY, \
|
||||
SOURCE_MEMORY_PRESSURE)
|
||||
|
||||
/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
|
||||
* Time sources and ratelimited sources can be passed, so effectively this is the same as the
|
||||
@ -130,6 +140,9 @@ struct sd_event {
|
||||
/* A list of inotify objects that already have events buffered which aren't processed yet */
|
||||
LIST_HEAD(struct inotify_data, buffered_inotify_data_list);
|
||||
|
||||
/* A list of memory pressure event sources that still need their subscription string written */
|
||||
LIST_HEAD(sd_event_source, memory_pressure_write_list);
|
||||
|
||||
pid_t original_pid;
|
||||
|
||||
uint64_t iteration;
|
||||
@ -524,6 +537,65 @@ static int source_child_pidfd_register(sd_event_source *s, int enabled) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void source_memory_pressure_unregister(sd_event_source *s) {
        int rc;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* Nothing to do if the event loop reports a PID change, or if the fd was never added to the
         * epoll instance in the first place. */
        if (event_pid_changed(s->event) || !s->memory_pressure.registered)
                return;

        /* Failure to remove the fd is only logged; the source is marked as unregistered either way. */
        rc = epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->memory_pressure.fd, NULL);
        if (rc < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->memory_pressure.registered = false;
}
|
||||
|
||||
static int source_memory_pressure_register(sd_event_source *s, int enabled) {
        struct epoll_event ev;
        uint32_t mask;
        int op;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);
        assert(enabled != SD_EVENT_OFF);

        /* As long as there is still data queued for writing we only wait for the fd to become writable.
         * Once the buffer is empty we watch for the configured events instead, honouring one-shot mode. */
        if (s->memory_pressure.write_buffer_size > 0)
                mask = EPOLLOUT;
        else {
                mask = s->memory_pressure.events;
                if (enabled == SD_EVENT_ONESHOT)
                        mask |= EPOLLONESHOT;
        }

        ev = (struct epoll_event) {
                .events = mask,
                .data.ptr = s,
        };

        /* Modify the existing registration if there is one, otherwise add a new one. */
        op = s->memory_pressure.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD;

        if (epoll_ctl(s->event->epoll_fd, op, s->memory_pressure.fd, &ev) < 0)
                return -errno;

        s->memory_pressure.registered = true;
        return 0;
}
|
||||
|
||||
static void source_memory_pressure_add_to_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* Enqueue the source on the event loop's list of pending writes, unless it is already on it. */
        if (!s->memory_pressure.in_write_list) {
                LIST_PREPEND(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
                s->memory_pressure.in_write_list = true;
        }
}
|
||||
|
||||
static void source_memory_pressure_remove_from_write_list(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* Dequeue the source from the event loop's list of pending writes, if it is currently on it. */
        if (s->memory_pressure.in_write_list) {
                LIST_REMOVE(memory_pressure.write_list, s->event->memory_pressure_write_list, s);
                s->memory_pressure.in_write_list = false;
        }
}
|
||||
|
||||
static clockid_t event_source_type_to_clock(EventSourceType t) {
|
||||
|
||||
switch (t) {
|
||||
@ -947,6 +1019,11 @@ static void source_disconnect(sd_event_source *s) {
|
||||
break;
|
||||
}
|
||||
|
||||
case SOURCE_MEMORY_PRESSURE:
|
||||
source_memory_pressure_remove_from_write_list(s);
|
||||
source_memory_pressure_unregister(s);
|
||||
break;
|
||||
|
||||
default:
|
||||
assert_not_reached();
|
||||
}
|
||||
@ -1017,6 +1094,11 @@ static sd_event_source* source_free(sd_event_source *s) {
|
||||
s->child.pidfd = safe_close(s->child.pidfd);
|
||||
}
|
||||
|
||||
if (s->type == SOURCE_MEMORY_PRESSURE) {
|
||||
s->memory_pressure.fd = safe_close(s->memory_pressure.fd);
|
||||
s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);
|
||||
}
|
||||
|
||||
if (s->destroy_callback)
|
||||
s->destroy_callback(s->userdata);
|
||||
|
||||
@ -1092,6 +1174,7 @@ static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType t
|
||||
[SOURCE_POST] = endoffsetof_field(sd_event_source, post),
|
||||
[SOURCE_EXIT] = endoffsetof_field(sd_event_source, exit),
|
||||
[SOURCE_INOTIFY] = endoffsetof_field(sd_event_source, inotify),
|
||||
[SOURCE_MEMORY_PRESSURE] = endoffsetof_field(sd_event_source, memory_pressure),
|
||||
};
|
||||
|
||||
sd_event_source *s;
|
||||
@ -1771,6 +1854,257 @@ _public_ int sd_event_add_exit(
|
||||
return 0;
|
||||
}
|
||||
|
||||
int sd_event_trim_memory(void) {
|
||||
int r;
|
||||
|
||||
/* A default implementation of a memory pressure callback. Simply releases our own allocation caches
|
||||
* and glibc's. This is automatically used when people call sd_event_add_memory_pressure() with a
|
||||
* NULL callback parameter. */
|
||||
|
||||
log_debug("Memory pressure event, trimming malloc() memory.");
|
||||
|
||||
#if HAVE_GENERIC_MALLINFO
|
||||
generic_mallinfo before_mallinfo = generic_mallinfo_get();
|
||||
#endif
|
||||
|
||||
usec_t before_timestamp = now(CLOCK_MONOTONIC);
|
||||
hashmap_trim_pools();
|
||||
r = malloc_trim(0);
|
||||
usec_t after_timestamp = now(CLOCK_MONOTONIC);
|
||||
|
||||
if (r > 0)
|
||||
log_debug("Successfully trimmed some memory.");
|
||||
else
|
||||
log_debug("Couldn't trim any memory.");
|
||||
|
||||
usec_t period = after_timestamp - before_timestamp;
|
||||
|
||||
#if HAVE_GENERIC_MALLINFO
|
||||
generic_mallinfo after_mallinfo = generic_mallinfo_get();
|
||||
size_t l = LESS_BY((size_t) before_mallinfo.hblkhd, (size_t) after_mallinfo.hblkhd) +
|
||||
LESS_BY((size_t) before_mallinfo.arena, (size_t) after_mallinfo.arena);
|
||||
log_struct(LOG_DEBUG,
|
||||
LOG_MESSAGE("Memory trimming took %s, returned %s to OS.",
|
||||
FORMAT_TIMESPAN(period, 0),
|
||||
FORMAT_BYTES(l)),
|
||||
"MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
|
||||
"TRIMMED_BYTES=%zu", l,
|
||||
"TRIMMED_USEC=" USEC_FMT, period);
|
||||
#else
|
||||
log_struct(LOG_DEBUG,
|
||||
LOG_MESSAGE("Memory trimming took %s.",
|
||||
FORMAT_TIMESPAN(period, 0)),
|
||||
"MESSAGE_ID=" SD_MESSAGE_MEMORY_TRIM_STR,
|
||||
"TRIMMED_USEC=" USEC_FMT, period);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int memory_pressure_callback(sd_event_source *s, void *userdata) {
        assert(s);

        /* Default handler installed by sd_event_add_memory_pressure() when no callback is supplied: trim
         * memory, ignore the result, and report success to the event loop. */
        (void) sd_event_trim_memory();

        return 0;
}
|
||||
|
||||
_public_ int sd_event_add_memory_pressure(
|
||||
sd_event *e,
|
||||
sd_event_source **ret,
|
||||
sd_event_handler_t callback,
|
||||
void *userdata) {
|
||||
|
||||
_cleanup_free_ char *w = NULL;
|
||||
_cleanup_(source_freep) sd_event_source *s = NULL;
|
||||
_cleanup_close_ int path_fd = -1, fd = -1;
|
||||
_cleanup_free_ void *write_buffer = NULL;
|
||||
const char *watch, *watch_fallback, *env;
|
||||
size_t write_buffer_size = 0;
|
||||
struct stat st;
|
||||
uint32_t events;
|
||||
bool locked;
|
||||
int r;
|
||||
|
||||
assert_return(e, -EINVAL);
|
||||
assert_return(e = event_resolve(e), -ENOPKG);
|
||||
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
|
||||
assert_return(!event_pid_changed(e), -ECHILD);
|
||||
|
||||
if (!callback)
|
||||
callback = memory_pressure_callback;
|
||||
|
||||
s = source_new(e, !ret, SOURCE_MEMORY_PRESSURE);
|
||||
if (!s)
|
||||
return -ENOMEM;
|
||||
|
||||
s->wakeup = WAKEUP_EVENT_SOURCE;
|
||||
s->memory_pressure.callback = callback;
|
||||
s->userdata = userdata;
|
||||
s->enabled = SD_EVENT_ON;
|
||||
s->memory_pressure.fd = -EBADF;
|
||||
|
||||
env = secure_getenv("MEMORY_PRESSURE_WATCH");
|
||||
if (env) {
|
||||
if (isempty(env) || path_equal(env, "/dev/null"))
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN),
|
||||
"Memory pressure logic is explicitly disabled via $MEMORY_PRESSURE_WATCH.");
|
||||
|
||||
if (!path_is_absolute(env) || !path_is_normalized(env))
|
||||
return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
|
||||
"$MEMORY_PRESSURE_WATCH set to invalid path: %s", env);
|
||||
|
||||
watch = env;
|
||||
|
||||
env = secure_getenv("MEMORY_PRESSURE_WRITE");
|
||||
if (env) {
|
||||
r = unbase64mem(env, SIZE_MAX, &write_buffer, &write_buffer_size);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
locked = true;
|
||||
} else {
|
||||
|
||||
r = is_pressure_supported();
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* By default we want to watch memory pressure on the local cgroup, but we'll fall back on
|
||||
* the system wide pressure if for some reason we cannot (which could be: memory controller
|
||||
* not delegated to us, or PSI simply not available in the kernel). On legacy cgroupv1 we'll
|
||||
* only use the system-wide logic. */
|
||||
r = cg_all_unified();
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r == 0)
|
||||
watch = "/proc/pressure/memory";
|
||||
else {
|
||||
_cleanup_free_ char *cg = NULL;
|
||||
|
||||
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cg);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
w = path_join("/sys/fs/cgroup", cg, "memory.pressure");
|
||||
if (!w)
|
||||
return -ENOMEM;
|
||||
|
||||
watch = w;
|
||||
watch_fallback = "/proc/pressure/memory";
|
||||
}
|
||||
|
||||
/* Android uses three levels in its userspace low memory killer logic:
|
||||
* some 70000 1000000
|
||||
* some 100000 1000000
|
||||
* full 70000 1000000
|
||||
*
|
||||
* GNOME's low memory monitor uses:
|
||||
* some 70000 1000000
|
||||
* some 100000 1000000
|
||||
* full 100000 1000000
|
||||
*
|
||||
* We'll default to the middle level that both agree on */
|
||||
if (asprintf((char**) &write_buffer,
|
||||
"%s " USEC_FMT " " USEC_FMT,
|
||||
MEMORY_PRESSURE_DEFAULT_TYPE,
|
||||
MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC,
|
||||
MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
|
||||
return -ENOMEM;
|
||||
|
||||
write_buffer_size = strlen(write_buffer) + 1;
|
||||
locked = false;
|
||||
}
|
||||
|
||||
path_fd = open(watch, O_PATH|O_CLOEXEC);
|
||||
if (path_fd < 0) {
|
||||
if (errno != ENOENT)
|
||||
return -errno;
|
||||
|
||||
/* We got ENOENT. Three options now: try the fallback if we have one, or return the error as
|
||||
* is (if based on user/env config), or return -EOPNOTSUPP (because we picked the path, and
|
||||
* the PSI service apparently is not supported) */
|
||||
if (!watch_fallback)
|
||||
return locked ? -ENOENT : -EOPNOTSUPP;
|
||||
|
||||
path_fd = open(watch_fallback, O_PATH|O_CLOEXEC);
|
||||
if (errno == ENOENT) /* PSI is not available in the kernel even under the fallback path? */
|
||||
return -EOPNOTSUPP;
|
||||
if (errno < 0)
|
||||
return -errno;
|
||||
}
|
||||
|
||||
if (fstat(path_fd, &st) < 0)
|
||||
return -errno;
|
||||
|
||||
if (S_ISSOCK(st.st_mode)) {
|
||||
fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
|
||||
if (fd < 0)
|
||||
return -errno;
|
||||
|
||||
r = connect_unix_path(fd, path_fd, NULL);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
events = EPOLLIN;
|
||||
|
||||
} else if (S_ISREG(st.st_mode) || S_ISFIFO(st.st_mode) || S_ISCHR(st.st_mode)) {
|
||||
fd = fd_reopen(path_fd, (write_buffer_size > 0 ? O_RDWR : O_RDONLY) |O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
if (S_ISREG(st.st_mode)) {
|
||||
struct statfs sfs;
|
||||
|
||||
/* If this is a regular file validate this is a procfs or cgroupfs file, where we look for EPOLLPRI */
|
||||
|
||||
if (fstatfs(fd, &sfs) < 0)
|
||||
return -errno;
|
||||
|
||||
if (!is_fs_type(&sfs, PROC_SUPER_MAGIC) &&
|
||||
!is_fs_type(&sfs, CGROUP2_SUPER_MAGIC))
|
||||
return -ENOTTY;
|
||||
|
||||
events = EPOLLPRI;
|
||||
} else
|
||||
/* For fifos and char devices just watch for EPOLLIN */
|
||||
events = EPOLLIN;
|
||||
|
||||
} else if (S_ISDIR(st.st_mode))
|
||||
return -EISDIR;
|
||||
else
|
||||
return -EBADF;
|
||||
|
||||
s->memory_pressure.fd = TAKE_FD(fd);
|
||||
s->memory_pressure.write_buffer = TAKE_PTR(write_buffer);
|
||||
s->memory_pressure.write_buffer_size = write_buffer_size;
|
||||
s->memory_pressure.events = events;
|
||||
s->memory_pressure.locked = locked;
|
||||
|
||||
/* So here's the thing: if we are talking to PSI we need to write the watch string before adding the
|
||||
* fd to epoll (if we ignore this, then the watch won't work). Hence we'll not actually register the
|
||||
* fd with the epoll right-away. Instead, we just add the event source to a list of memory pressure
|
||||
* event sources on which writes must be executed before the first event loop iteration is
|
||||
* executed. (We could also write the data here, right away, but we want to give the caller the
|
||||
* freedom to call sd_event_source_set_memory_pressure_type() and
|
||||
* sd_event_source_set_memory_pressure_rate() before we write it. */
|
||||
|
||||
if (s->memory_pressure.write_buffer_size > 0)
|
||||
source_memory_pressure_add_to_write_list(s);
|
||||
else {
|
||||
r = source_memory_pressure_register(s, s->enabled);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
if (ret)
|
||||
*ret = s;
|
||||
TAKE_PTR(s);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
|
||||
assert(e);
|
||||
|
||||
@ -2562,6 +2896,10 @@ static int event_source_offline(
|
||||
prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
|
||||
break;
|
||||
|
||||
case SOURCE_MEMORY_PRESSURE:
|
||||
source_memory_pressure_unregister(s);
|
||||
break;
|
||||
|
||||
case SOURCE_TIME_REALTIME:
|
||||
case SOURCE_TIME_BOOTTIME:
|
||||
case SOURCE_TIME_MONOTONIC:
|
||||
@ -2649,6 +2987,13 @@ static int event_source_online(
|
||||
s->event->n_online_child_sources++;
|
||||
break;
|
||||
|
||||
case SOURCE_MEMORY_PRESSURE:
|
||||
r = source_memory_pressure_register(s, enabled);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
break;
|
||||
|
||||
case SOURCE_TIME_REALTIME:
|
||||
case SOURCE_TIME_BOOTTIME:
|
||||
case SOURCE_TIME_MONOTONIC:
|
||||
@ -3630,6 +3975,106 @@ static int process_inotify(sd_event *e) {
|
||||
return done;
|
||||
}
|
||||
|
||||
/* Stores the epoll events reported for a memory pressure event source and marks the source pending.
 *
 * If the source is already pending, the new event bits are merged into the mask collected so far;
 * otherwise they replace whatever was stored before. Returns the result of source_set_pending(). */
static int process_memory_pressure(sd_event_source *s, uint32_t revents) {
        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* Not pending yet? Then drop any stale bits before accumulating the new ones. */
        if (!s->pending)
                s->memory_pressure.revents = 0;

        s->memory_pressure.revents |= revents;

        return source_set_pending(s, true);
}
|
||||
|
||||
/* Tries to write the queued watch string of a memory pressure event source to its fd.
 *
 * Return values:
 *   < 0  a non-transient write() error, a failure to re-register the source, or -ENOMEM
 *   1    only part of the buffer was written; the remainder stays queued for a later attempt
 *   0    in all other cases (everything written, nothing to write, or a transient write() error) */
static int source_memory_pressure_write(sd_event_source *s) {
        ssize_t written = 0;
        size_t pending;
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        /* As soon as the first write is attempted the buffer is locked, further changes are refused. */
        s->memory_pressure.locked = true;

        pending = s->memory_pressure.write_buffer_size;
        if (pending > 0) {
                written = write(s->memory_pressure.fd, s->memory_pressure.write_buffer, pending);
                if (written < 0) {
                        if (!ERRNO_IS_TRANSIENT(errno))
                                return -errno;

                        written = 0; /* transient error: behave as if nothing was written this time */
                }
        }

        assert(written >= 0);

        if ((size_t) written != pending) {
                _cleanup_free_ void *rest = NULL;

                if (written == 0)
                        return 0; /* nothing went out, leave the queued buffer untouched */

                /* Short write: keep only the part that still needs to be written. */
                assert((size_t) written < pending);

                rest = memdup((uint8_t*) s->memory_pressure.write_buffer + written, pending - written);
                if (!rest)
                        return -ENOMEM;

                free_and_replace(s->memory_pressure.write_buffer, rest);
                s->memory_pressure.write_buffer_size -= written;
                return 1;
        }

        /* The whole buffer (possibly an empty one) is done, release it. */
        s->memory_pressure.write_buffer = mfree(s->memory_pressure.write_buffer);

        if (written == 0)
                return 0;

        s->memory_pressure.write_buffer_size = 0;

        /* Everything has been written, hence refresh the epoll events mask: EPOLLOUT is of no
         * interest anymore. */
        r = source_memory_pressure_register(s, s->enabled);
        if (r < 0)
                return r;

        return 0;
}
|
||||
|
||||
/* Runs the internal part of dispatching a memory pressure event source, before the user callback.
 *
 * Return values:
 *   < 0  failure; -EIO specifically if the fd reported EPOLLHUP/EPOLLERR/EPOLLRDHUP without any
 *        incoming data
 *   1    nothing left to do for this iteration, the user callback shall not be invoked
 *   0    incoming data was seen, continue with the user callback */
static int source_memory_pressure_initiate_dispatch(sd_event_source *s) {
        int r;

        assert(s);
        assert(s->type == SOURCE_MEMORY_PRESSURE);

        r = source_memory_pressure_write(s);
        if (r != 0)
                /* On error propagate it. If something was written only partially, skip the user
                 * dispatch function and go straight back to waiting for the next EPOLLOUT. */
                return r < 0 ? r : 1;

        if ((s->memory_pressure.revents & (EPOLLIN|EPOLLPRI)) != 0) {

                if (s->memory_pressure.revents & EPOLLIN) {
                        uint8_t pipe_buf[PIPE_BUF];
                        ssize_t n;

                        /* The fd is readable: drain whatever might be queued on it */
                        n = read(s->memory_pressure.fd, pipe_buf, sizeof(pipe_buf));
                        if (n < 0 && !ERRNO_IS_TRANSIENT(errno))
                                return -errno;
                }

                return 0; /* proceed to the user callback */
        }

        /* No incoming IO pending. IO errors on the notifier are handled like errors returned by a
         * callback. */
        if ((s->memory_pressure.revents & (EPOLLHUP|EPOLLERR|EPOLLRDHUP)) != 0)
                return -EIO;

        return 1; /* leave dispatch, everything has been processed already */
}
|
||||
|
||||
static int source_dispatch(sd_event_source *s) {
|
||||
EventSourceType saved_type;
|
||||
sd_event *saved_event;
|
||||
@ -3678,6 +4123,16 @@ static int source_dispatch(sd_event_source *s) {
|
||||
}
|
||||
}
|
||||
|
||||
if (s->type == SOURCE_MEMORY_PRESSURE) {
|
||||
r = source_memory_pressure_initiate_dispatch(s);
|
||||
if (r == -EIO) /* handle EIO errors similar to callback errors */
|
||||
goto finish;
|
||||
if (r < 0)
|
||||
return r;
|
||||
if (r > 0) /* already handled */
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (s->enabled == SD_EVENT_ONESHOT) {
|
||||
r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
|
||||
if (r < 0)
|
||||
@ -3764,6 +4219,10 @@ static int source_dispatch(sd_event_source *s) {
|
||||
break;
|
||||
}
|
||||
|
||||
case SOURCE_MEMORY_PRESSURE:
|
||||
r = s->memory_pressure.callback(s, s->userdata);
|
||||
break;
|
||||
|
||||
case SOURCE_WATCHDOG:
|
||||
case _SOURCE_EVENT_SOURCE_TYPE_MAX:
|
||||
case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
|
||||
@ -3772,6 +4231,7 @@ static int source_dispatch(sd_event_source *s) {
|
||||
|
||||
s->dispatching = false;
|
||||
|
||||
finish:
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
|
||||
strna(s->description),
|
||||
@ -3922,6 +4382,30 @@ static void event_close_inode_data_fds(sd_event *e) {
|
||||
}
|
||||
}
|
||||
|
||||
/* Empties the event loop's list of memory pressure sources that still have a watch string to write,
 * calling source_memory_pressure_write() on each of them in turn.
 *
 * Returns 0 once the list is empty, or the first negative error returned by a write attempt (in which
 * case the remaining sources stay on the list). */
static int event_memory_pressure_write_list(sd_event *e) {
        sd_event_source *s;
        int r;

        assert(e);

        while ((s = LIST_POP(memory_pressure.write_list, e->memory_pressure_write_list))) {
                assert(s->type == SOURCE_MEMORY_PRESSURE);
                assert(s->memory_pressure.write_buffer_size > 0);

                /* The source has been taken off the list, record that before writing */
                s->memory_pressure.in_write_list = false;

                r = source_memory_pressure_write(s);
                if (r < 0)
                        return r;
        }

        return 0;
}
|
||||
|
||||
_public_ int sd_event_prepare(sd_event *e) {
|
||||
int r;
|
||||
|
||||
@ -3950,6 +4434,10 @@ _public_ int sd_event_prepare(sd_event *e) {
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = event_memory_pressure_write_list(e);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = event_arm_timer(e, &e->realtime);
|
||||
if (r < 0)
|
||||
return r;
|
||||
@ -4115,6 +4603,10 @@ static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t
|
||||
r = process_pidfd(e, s, e->event_queue[i].events);
|
||||
break;
|
||||
|
||||
case SOURCE_MEMORY_PRESSURE:
|
||||
r = process_memory_pressure(s, e->event_queue[i].events);
|
||||
break;
|
||||
|
||||
default:
|
||||
assert_not_reached();
|
||||
}
|
||||
@ -4700,3 +5192,91 @@ _public_ int sd_event_set_signal_exit(sd_event *e, int b) {
|
||||
|
||||
return change;
|
||||
}
|
||||
|
||||
/* Replaces the pressure type prefix ("some" or "full") of the watch string queued for a memory
 * pressure event source, leaving the rest of the string (from the first space on) as it is.
 *
 * Returns 1 if the queued string was changed and 0 if it already carried the requested type. Fails
 * with -EINVAL if ty is not "some" or "full", or if the queued string contains no space or does not
 * begin with one of the two types; with -EBUSY if the parameters are locked; with -ENOMEM if memory
 * is short. */
_public_ int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty) {
        _cleanup_free_ char *current = NULL;
        _cleanup_free_ void *replacement = NULL;
        size_t prefix_len, tail_len, type_len, total;
        char *sep;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);
        assert_return(ty, -EINVAL);

        if (!STR_IN_SET(ty, "some", "full"))
                return -EINVAL;

        /* Refuse adjusting parameters, if caller told us how to watch for events */
        if (s->memory_pressure.locked)
                return -EBUSY;

        /* The type is everything in front of the first space */
        sep = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!sep)
                return -EINVAL;

        prefix_len = sep - (char*) s->memory_pressure.write_buffer;
        tail_len = s->memory_pressure.write_buffer_size - prefix_len;

        current = memdup_suffix0(s->memory_pressure.write_buffer, prefix_len);
        if (!current)
                return -ENOMEM;
        if (!STR_IN_SET(current, "some", "full"))
                return -EINVAL;

        if (streq(current, ty))
                return 0; /* already what was asked for */

        type_len = strlen(ty);
        total = type_len + tail_len;

        replacement = new(char, total);
        if (!replacement)
                return -ENOMEM;

        /* New type first, followed by the unchanged remainder starting at the space */
        memcpy(replacement, ty, type_len);
        memcpy((char*) replacement + type_len, sep, tail_len);

        free_and_replace(s->memory_pressure.write_buffer, replacement);
        s->memory_pressure.write_buffer_size = total;
        s->memory_pressure.locked = false;

        return 1;
}
|
||||
|
||||
/* Rewrites the watch string queued for a memory pressure event source so that it carries the given
 * threshold and window (both in µs), keeping the pressure type ("some" or "full") the string
 * currently starts with. The new string has the form "<type> <threshold> <window>" and is stored
 * including its trailing NUL byte.
 *
 * Returns 1 if the queued string was changed and 0 if it already had exactly the requested contents.
 * Fails with:
 *   -ERANGE  threshold_usec or window_usec is 0 or UINT64_MAX
 *   -EINVAL  threshold_usec is larger than window_usec, or the queued string contains no space or
 *            does not begin with "some"/"full"
 *   -EBUSY   the parameters are locked
 *   -ENOMEM  memory is short */
_public_ int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec) {
        _cleanup_free_ char *b = NULL;
        _cleanup_free_ void *w = NULL;
        char *formatted = NULL;

        assert_return(s, -EINVAL);
        assert_return(s->type == SOURCE_MEMORY_PRESSURE, -EDOM);

        if (threshold_usec <= 0 || threshold_usec >= UINT64_MAX)
                return -ERANGE;
        if (window_usec <= 0 || window_usec >= UINT64_MAX)
                return -ERANGE;
        if (threshold_usec > window_usec)
                return -EINVAL;

        if (s->memory_pressure.locked) /* Refuse adjusting parameters, if caller told us how to watch for events */
                return -EBUSY;

        /* The pressure type is everything in front of the first space of the queued string */
        char* space = memchr(s->memory_pressure.write_buffer, ' ', s->memory_pressure.write_buffer_size);
        if (!space)
                return -EINVAL;

        size_t l = (char*) space - (char*) s->memory_pressure.write_buffer;
        b = memdup_suffix0(s->memory_pressure.write_buffer, l);
        if (!b)
                return -ENOMEM;
        if (!STR_IN_SET(b, "some", "full"))
                return -EINVAL;

        /* Format into a plain pointer first: if asprintf() fails the contents of the result pointer
         * are not guaranteed, hence it must not be subject to automatic cleanup. A failure here means
         * the string could not be allocated, which is -ENOMEM and not a caller error. */
        if (asprintf(&formatted,
                     "%s " USEC_FMT " " USEC_FMT "",
                     b,
                     threshold_usec,
                     window_usec) < 0)
                return -ENOMEM;

        w = formatted; /* from here on the cleanup handler owns the string */

        l = strlen(formatted) + 1; /* the trailing NUL byte is part of the buffer */
        if (memcmp_nn(s->memory_pressure.write_buffer, s->memory_pressure.write_buffer_size, w, l) == 0)
                return 0;

        free_and_replace(s->memory_pressure.write_buffer, w);
        s->memory_pressure.write_buffer_size = l;
        s->memory_pressure.locked = false;

        return 1;
}
|
||||
|
@ -99,6 +99,7 @@ int sd_event_add_inotify_fd(sd_event *e, sd_event_source **s, int fd, uint32_t m
|
||||
int sd_event_add_defer(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
|
||||
int sd_event_add_post(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
|
||||
int sd_event_add_exit(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
|
||||
int sd_event_add_memory_pressure(sd_event *e, sd_event_source **s, sd_event_handler_t callback, void *userdata);
|
||||
|
||||
int sd_event_prepare(sd_event *e);
|
||||
int sd_event_wait(sd_event *e, uint64_t usec);
|
||||
@ -160,6 +161,8 @@ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo
|
||||
int sd_event_source_send_child_signal(sd_event_source *s, int sig, const void *si, unsigned flags);
|
||||
#endif
|
||||
int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *ret);
|
||||
/* Selects the pressure type ("some" or "full") a memory pressure event source watches for. */
int sd_event_source_set_memory_pressure_type(sd_event_source *s, const char *ty);
|
||||
int sd_event_source_set_memory_pressure_period(sd_event_source *s, uint64_t threshold_usec, uint64_t window_usec);
|
||||
int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback);
|
||||
int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret);
|
||||
int sd_event_source_get_floating(sd_event_source *s);
|
||||
@ -171,6 +174,8 @@ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval_use
|
||||
int sd_event_source_is_ratelimited(sd_event_source *s);
|
||||
int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback);
|
||||
|
||||
int sd_event_trim_memory(void);
|
||||
|
||||
/* Define helpers so that __attribute__((cleanup(sd_event_unrefp))) and similar may be used. */
|
||||
_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event, sd_event_unref);
|
||||
_SD_DEFINE_POINTER_CLEANUP_FUNC(sd_event_source, sd_event_source_unref);
|
||||
|
@ -195,6 +195,9 @@ _SD_BEGIN_DECLARATIONS;
|
||||
#define SD_MESSAGE_TPM_PCR_EXTEND SD_ID128_MAKE(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab)
|
||||
#define SD_MESSAGE_TPM_PCR_EXTEND_STR SD_ID128_MAKE_STR(3f,7d,5e,f3,e5,4f,43,02,b4,f0,b1,43,bb,27,0c,ab)
|
||||
|
||||
#define SD_MESSAGE_MEMORY_TRIM SD_ID128_MAKE(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21)
|
||||
#define SD_MESSAGE_MEMORY_TRIM_STR SD_ID128_MAKE_STR(f9,b0,be,46,5a,d5,40,d0,85,0a,d3,21,72,d5,7c,21)
|
||||
|
||||
_SD_END_DECLARATIONS;
|
||||
|
||||
#endif
|
||||
|
@ -386,6 +386,10 @@ tests += [
|
||||
'sources' : files('test-math-util.c'),
|
||||
'dependencies' : libm,
|
||||
},
|
||||
{
|
||||
'sources' : files('test-mempress.c'),
|
||||
'dependencies' : threads,
|
||||
},
|
||||
{
|
||||
'sources' : files('test-namespace.c'),
|
||||
'dependencies' : [
|
||||
|
309
src/test/test-mempress.c
Normal file
309
src/test/test-mempress.c
Normal file
@ -0,0 +1,309 @@
|
||||
/* SPDX-License-Identifier: LGPL-2.1-or-later */
|
||||
|
||||
#include <fcntl.h>
|
||||
#include <pthread.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <sd-bus.h>
|
||||
#include <sd-event.h>
|
||||
|
||||
#include "bus-locator.h"
|
||||
#include "bus-wait-for-jobs.h"
|
||||
#include "fd-util.h"
|
||||
#include "path-util.h"
|
||||
#include "process-util.h"
|
||||
#include "random-util.h"
|
||||
#include "rm-rf.h"
|
||||
#include "signal-util.h"
|
||||
#include "socket-util.h"
|
||||
#include "tests.h"
|
||||
#include "tmpfile-util.h"
|
||||
#include "unit-def.h"
|
||||
|
||||
/* File descriptors handed to fake_pressure_thread(). The structure is heap allocated by the test and
 * freed by the thread. */
struct fake_pressure_context {
|
||||
int fifo_fd; /* the thread writes a single byte to this fd */
|
||||
int socket_fd; /* the thread accepts one connection on this fd */
|
||||
};
|
||||
|
||||
static void *fake_pressure_thread(void *p) {
|
||||
_cleanup_free_ struct fake_pressure_context *c = ASSERT_PTR(p);
|
||||
_cleanup_close_ int cfd = -1;
|
||||
|
||||
usleep(150);
|
||||
|
||||
assert_se(write(c->fifo_fd, &(const char) { 'x' }, 1) == 1);
|
||||
|
||||
usleep(150);
|
||||
|
||||
cfd = accept4(c->socket_fd, NULL, NULL, SOCK_CLOEXEC);
|
||||
assert_se(cfd >= 0);
|
||||
char buf[STRLEN("hello")+1] = {};
|
||||
assert_se(read(cfd, buf, sizeof(buf)-1) == sizeof(buf)-1);
|
||||
assert_se(streq(buf, "hello"));
|
||||
assert_se(write(cfd, &(const char) { 'z' }, 1) == 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Memory pressure callback shared by both event sources of TEST(fake_pressure).
 *
 * userdata points to an int that gets multiplied by the first character of the source's description.
 * Once the value equals 7 * 'f' * 's' the event loop is asked to exit with code 0. Always returns 0. */
static int fake_pressure_callback(sd_event_source *s, void *userdata) {
        int *product = userdata;
        const char *description;

        assert_se(s);
        assert_se(sd_event_source_get_description(s, &description) >= 0);

        *product = *product * description[0];

        log_notice("memory pressure event: %s", description);

        if (*product != 7 * 'f' * 's')
                return 0; /* still waiting for the other source */

        assert_se(sd_event_exit(sd_event_source_get_event(s), 0) >= 0);
        return 0;
}
|
||||
|
||||
TEST(fake_pressure) {
|
||||
_cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *ef = NULL;
|
||||
_cleanup_(sd_event_unrefp) sd_event *e = NULL;
|
||||
_cleanup_free_ char *j = NULL, *k = NULL;
|
||||
_cleanup_(rm_rf_physical_and_freep) char *tmp = NULL;
|
||||
_cleanup_close_ int fifo_fd = -1, socket_fd = -1;
|
||||
union sockaddr_union sa;
|
||||
pthread_t th;
|
||||
int value = 7;
|
||||
|
||||
assert_se(sd_event_default(&e) >= 0);
|
||||
|
||||
assert_se(mkdtemp_malloc(NULL, &tmp) >= 0);
|
||||
|
||||
assert_se(j = path_join(tmp, "fifo"));
|
||||
assert_se(mkfifo(j, 0600) >= 0);
|
||||
fifo_fd = open(j, O_CLOEXEC|O_RDWR|O_NONBLOCK);
|
||||
assert_se(fifo_fd >= 0);
|
||||
|
||||
assert_se(k = path_join(tmp, "sock"));
|
||||
socket_fd = socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC, 0);
|
||||
assert_se(socket_fd >= 0);
|
||||
assert_se(sockaddr_un_set_path(&sa.un, k) >= 0);
|
||||
assert_se(bind(socket_fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) >= 0);
|
||||
assert_se(listen(socket_fd, 1) >= 0);
|
||||
|
||||
/* Ideally we'd just allocate this on the stack, but AddressSanitizer doesn't like it if threads
|
||||
* access each other's stack */
|
||||
struct fake_pressure_context *fp = new(struct fake_pressure_context, 1);
|
||||
assert_se(fp);
|
||||
*fp = (struct fake_pressure_context) {
|
||||
.fifo_fd = fifo_fd,
|
||||
.socket_fd = socket_fd,
|
||||
};
|
||||
|
||||
assert_se(pthread_create(&th, NULL, fake_pressure_thread, TAKE_PTR(fp)) == 0);
|
||||
|
||||
assert_se(setenv("MEMORY_PRESSURE_WATCH", j, /* override= */ true) >= 0);
|
||||
assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0);
|
||||
|
||||
assert_se(sd_event_add_memory_pressure(e, &es, fake_pressure_callback, &value) >= 0);
|
||||
assert_se(sd_event_source_set_description(es, "fifo event source") >= 0);
|
||||
|
||||
assert_se(setenv("MEMORY_PRESSURE_WATCH", k, /* override= */ true) >= 0);
|
||||
assert_se(setenv("MEMORY_PRESSURE_WRITE", "aGVsbG8K", /* override= */ true) >= 0);
|
||||
|
||||
assert_se(sd_event_add_memory_pressure(e, &ef, fake_pressure_callback, &value) >= 0);
|
||||
assert_se(sd_event_source_set_description(ef, "socket event source") >= 0);
|
||||
|
||||
assert_se(sd_event_loop(e) >= 0);
|
||||
|
||||
assert_se(value == 7 * 'f' * 's');
|
||||
|
||||
assert_se(pthread_join(th, NULL) == 0);
|
||||
}
|
||||
|
||||
/* State passed as userdata to real_pressure_callback(). */
struct real_pressure_context {
|
||||
sd_event_source *pid; /* child event source the callback sends SIGKILL to; reset to NULL afterwards */
|
||||
};
|
||||
|
||||
/* Memory pressure callback of TEST(real_pressure).
 *
 * Logs the event, trims memory via sd_event_trim_memory() and then sends SIGKILL to the child process
 * referenced by the struct real_pressure_context passed as userdata. The reference is cleared
 * afterwards, so a second invocation would trip the assertion. Always returns 0. */
static int real_pressure_callback(sd_event_source *s, void *userdata) {
        struct real_pressure_context *ctx = ASSERT_PTR(userdata);
        const char *description;

        assert_se(s);
        assert_se(sd_event_source_get_description(s, &description) >= 0);

        log_notice("real_memory pressure event: %s", description);

        (void) sd_event_trim_memory();

        /* Stop the memory eater */
        assert_se(ctx->pid);
        assert_se(sd_event_source_send_child_signal(ctx->pid, SIGKILL, NULL, 0) >= 0);
        ctx->pid = NULL;

        return 0;
}
|
||||
|
||||
#define MMAP_SIZE (10 * 1024 * 1024)
|
||||
|
||||
_noreturn_ static void real_pressure_eat_memory(int pipe_fd) {
|
||||
size_t ate = 0;
|
||||
|
||||
/* Allocates and touches 10M at a time, until runs out of memory */
|
||||
|
||||
char x;
|
||||
assert_se(read(pipe_fd, &x, 1) == 1); /* Wait for the GO! */
|
||||
|
||||
for (;;) {
|
||||
void *p;
|
||||
|
||||
p = mmap(NULL, MMAP_SIZE, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
|
||||
assert_se(p != MAP_FAILED);
|
||||
|
||||
log_info("Eating another %s.", FORMAT_BYTES(MMAP_SIZE));
|
||||
|
||||
memset(p, random_u32() & 0xFF, MMAP_SIZE);
|
||||
ate += MMAP_SIZE;
|
||||
|
||||
log_info("Ate %s in total.", FORMAT_BYTES(ate));
|
||||
|
||||
usleep(50 * USEC_PER_MSEC);
|
||||
}
|
||||
}
|
||||
|
||||
/* Child event callback of TEST(real_pressure): verifies that the child was terminated by SIGKILL and
 * then asks the event loop to exit with code 31. Always returns 0. */
static int real_pressure_child_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        sd_event *event;

        assert_se(s);
        assert_se(si);

        log_notice("child dead");

        /* The child must have died from the SIGKILL sent by the memory pressure callback */
        assert_se(si->si_signo == SIGCHLD);
        assert_se(si->si_status == SIGKILL);
        assert_se(si->si_code == CLD_KILLED);

        event = sd_event_source_get_event(s);
        assert_se(sd_event_exit(event, 31) >= 0);

        return 0;
}
|
||||
|
||||
TEST(real_pressure) {
|
||||
_cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL;
|
||||
_cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
|
||||
_cleanup_(sd_event_source_unrefp) sd_event_source *es = NULL, *cs = NULL;
|
||||
_cleanup_(bus_wait_for_jobs_freep) BusWaitForJobs *w = NULL;
|
||||
_cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
|
||||
_cleanup_close_pair_ int pipe_fd[2] = PIPE_EBADF;
|
||||
_cleanup_(sd_event_unrefp) sd_event *e = NULL;
|
||||
_cleanup_free_ char *scope = NULL;
|
||||
const char *object;
|
||||
int r;
|
||||
pid_t pid;
|
||||
|
||||
r = sd_bus_open_system(&bus);
|
||||
if (r < 0) {
|
||||
log_notice_errno(r, "Can't connect to system bus, skipping test: %m");
|
||||
return;
|
||||
}
|
||||
|
||||
assert_se(bus_wait_for_jobs_new(bus, &w) >= 0);
|
||||
|
||||
assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "StartTransientUnit") >= 0);
|
||||
assert_se(asprintf(&scope, "test-%" PRIu64 ".scope", random_u64()) >= 0);
|
||||
assert_se(sd_bus_message_append(m, "ss", scope, "fail") >= 0);
|
||||
assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0);
|
||||
assert_se(sd_bus_message_append(m, "(sv)", "PIDs", "au", 1, 0) >= 0);
|
||||
assert_se(sd_bus_message_append(m, "(sv)", "MemoryAccounting", "b", true) >= 0);
|
||||
assert_se(sd_bus_message_close_container(m) >= 0);
|
||||
assert_se(sd_bus_message_append(m, "a(sa(sv))", 0) >= 0);
|
||||
|
||||
r = sd_bus_call(bus, m, 0, &error, &reply);
|
||||
if (r < 0) {
|
||||
log_notice_errno(r, "Can't issue transient unit call, skipping test: %m");
|
||||
return;
|
||||
}
|
||||
|
||||
assert_se(sd_bus_message_read(reply, "o", &object) >= 0);
|
||||
|
||||
assert_se(bus_wait_for_jobs_one(w, object, /* quiet= */ false, /* extra_args= */ NULL) >= 0);
|
||||
|
||||
assert_se(sd_event_default(&e) >= 0);
|
||||
|
||||
assert_se(pipe2(pipe_fd, O_CLOEXEC) >= 0);
|
||||
|
||||
r = safe_fork("(eat-memory)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
|
||||
assert_se(r >= 0);
|
||||
if (r == 0) {
|
||||
real_pressure_eat_memory(pipe_fd[0]);
|
||||
_exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, -1) >= 0);
|
||||
assert_se(sd_event_add_child(e, &cs, pid, WEXITED, real_pressure_child_callback, NULL) >= 0);
|
||||
assert_se(sd_event_source_set_child_process_own(cs, true) >= 0);
|
||||
|
||||
assert_se(unsetenv("MEMORY_PRESSURE_WATCH") >= 0);
|
||||
assert_se(unsetenv("MEMORY_PRESSURE_WRITE") >= 0);
|
||||
|
||||
struct real_pressure_context context = {
|
||||
.pid = cs,
|
||||
};
|
||||
|
||||
r = sd_event_add_memory_pressure(e, &es, real_pressure_callback, &context);
|
||||
if (r < 0) {
|
||||
log_notice_errno(r, "Can't allocate memory pressure fd, skipping test: %m");
|
||||
return;
|
||||
}
|
||||
|
||||
assert_se(sd_event_source_set_description(es, "real pressure event source") >= 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_type(es, "full") > 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_type(es, "full") == 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_type(es, "some") > 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_type(es, "some") == 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) > 0);
|
||||
assert_se(sd_event_source_set_memory_pressure_period(es, 70 * USEC_PER_MSEC, USEC_PER_SEC) == 0);
|
||||
assert_se(sd_event_source_set_enabled(es, SD_EVENT_ONESHOT) >= 0);
|
||||
|
||||
_cleanup_free_ char *uo;
|
||||
assert_se(uo = unit_dbus_path_from_name(scope));
|
||||
|
||||
uint64_t mcurrent = UINT64_MAX;
|
||||
assert_se(sd_bus_get_property_trivial(bus, "org.freedesktop.systemd1", uo, "org.freedesktop.systemd1.Scope", "MemoryCurrent", &error, 't', &mcurrent) >= 0);
|
||||
|
||||
printf("current: %" PRIu64 "\n", mcurrent);
|
||||
if (mcurrent == UINT64_MAX) {
|
||||
log_notice_errno(r, "Memory accounting not available, skipping test: %m");
|
||||
return;
|
||||
}
|
||||
|
||||
m = sd_bus_message_unref(m);
|
||||
|
||||
assert_se(bus_message_new_method_call(bus, &m, bus_systemd_mgr, "SetUnitProperties") >= 0);
|
||||
assert_se(sd_bus_message_append(m, "sb", scope, true) >= 0);
|
||||
assert_se(sd_bus_message_open_container(m, 'a', "(sv)") >= 0);
|
||||
assert_se(sd_bus_message_append(m, "(sv)", "MemoryHigh", "t", mcurrent + (15 * 1024 * 1024)) >= 0);
|
||||
assert_se(sd_bus_message_append(m, "(sv)", "MemoryMax", "t", mcurrent + (50 * 1024 * 1024)) >= 0);
|
||||
assert_se(sd_bus_message_close_container(m) >= 0);
|
||||
|
||||
assert_se(sd_bus_call(bus, m, 0, NULL, NULL) >= 0);
|
||||
|
||||
/* Generate some memory allocations via mempool */
|
||||
#define NN (1024)
|
||||
Hashmap **h = new(Hashmap*, NN);
|
||||
for (int i = 0; i < NN; i++)
|
||||
h[i] = hashmap_new(NULL);
|
||||
for (int i = 0; i < NN; i++)
|
||||
hashmap_free(h[i]);
|
||||
free(h);
|
||||
|
||||
/* Now start eating memory */
|
||||
assert_se(write(pipe_fd[1], &(const char) { 'x' }, 1) == 1);
|
||||
|
||||
assert_se(sd_event_loop(e) >= 0);
|
||||
int ex = 0;
|
||||
assert_se(sd_event_get_exit_code(e, &ex) >= 0);
|
||||
assert_se(ex == 31);
|
||||
}
|
||||
|
||||
/* Hook handed to DEFINE_TEST_MAIN_FULL() below (presumably run once the tests are done): trims the
 * hashmap memory pools and reports success. */
static int outro(void) {
|
||||
hashmap_trim_pools();
|
||||
return 0;
|
||||
}
|
||||
|
||||
DEFINE_TEST_MAIN_FULL(LOG_DEBUG, NULL, outro);
|
Loading…
Reference in New Issue
Block a user