1
0
mirror of https://github.com/systemd/systemd.git synced 2024-12-22 17:35:35 +03:00

core/manager: add dbus API to create auxiliary scope from running service

This commit introduces a new D-Bus API, StartAuxiliaryScope(). It may be
used by services as part of the restart procedure. The service sends an
array of PID file descriptors corresponding to processes that are part
of the service and must continue running after the service restarts,
i.e. they haven't yet finished the job they were spawned for
(e.g. a long-running video transcoding job). Systemd creates a new
scope unit for these processes and migrates them into it. The cgroup
properties of the scope are copied from the service, so it retains the
same cgroup settings and limits as the service had.
This commit is contained in:
Michal Sekletar 2023-10-30 12:08:59 +01:00
parent 04d4086c22
commit 84c01612de
4 changed files with 516 additions and 0 deletions

View File

@ -277,6 +277,11 @@ node /org/freedesktop/systemd1 {
GetDynamicUsers(out a(us) users);
DumpUnitFileDescriptorStore(in s name,
out a(suuutuusu) entries);
StartAuxiliaryScope(in s name,
in ah pidfds,
in t flags,
in a(sv) properties,
out o job);
signals:
UnitNew(s id,
o unit);
@ -990,6 +995,8 @@ node /org/freedesktop/systemd1 {
<variablelist class="dbus-method" generated="True" extra-ref="DumpUnitFileDescriptorStore()"/>
<variablelist class="dbus-method" generated="True" extra-ref="StartAuxiliaryScope()"/>
<variablelist class="dbus-signal" generated="True" extra-ref="UnitNew"/>
<variablelist class="dbus-signal" generated="True" extra-ref="UnitRemoved"/>
@ -1567,6 +1574,13 @@ node /org/freedesktop/systemd1 {
file descriptors currently in the file descriptor store of the specified unit. This call is equivalent
to <function>DumpFileDescriptorStore()</function> on the
<interfacename>org.freedesktop.systemd1.Service</interfacename>. For further details, see below.</para>
<para><function>StartAuxiliaryScope()</function> creates a new scope unit from the service in which the calling
process resides. The set of processes that will be migrated to the newly created scope is passed in as an array
of pidfds. This is useful for creating auxiliary scopes that contain worker processes whose lifecycle
shouldn't be bound to the lifecycle of the service, e.g. they should continue running after the restart
of the service. Note that the main PID of the service cannot be migrated to an auxiliary scope.
Also, the <varname>flags</varname> argument must be 0 and is reserved for future extensions.</para>
</refsect2>
<refsect2>
@ -11826,6 +11840,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<function>QueueSignalUnit()</function>,
<function>SoftReboot()</function>, and
<function>DumpUnitFileDescriptorStore()</function> were added in version 254.</para>
<para><function>StartAuxiliaryScope()</function> was added in version 256.</para>
</refsect2>
<refsect2>
<title>Unit Objects</title>

View File

@ -33,6 +33,7 @@
#include "process-util.h"
#include "procfs-util.h"
#include "restrict-ifaces.h"
#include "set.h"
#include "special.h"
#include "stdio-util.h"
#include "string-table.h"
@ -189,6 +190,313 @@ void cgroup_context_init(CGroupContext *c) {
};
}
/* Duplicates the IO device weight entry 'w' (path and weight) and prepends the copy to
 * c->io_device_weights. Returns 0 on success, -ENOMEM if an allocation fails. */
int cgroup_context_add_io_device_weight_dup(CGroupContext *c, CGroupIODeviceWeight *w) {
        _cleanup_free_ CGroupIODeviceWeight *copy = NULL;
        _cleanup_free_ char *path = NULL;

        assert(c);
        assert(w);

        path = strdup(w->path);
        if (!path)
                return -ENOMEM;

        copy = new0(CGroupIODeviceWeight, 1);
        if (!copy)
                return -ENOMEM;

        /* The entry takes over ownership of the duplicated path. */
        copy->path = TAKE_PTR(path);
        copy->weight = w->weight;

        LIST_PREPEND(device_weights, c->io_device_weights, TAKE_PTR(copy));
        return 0;
}
/* Duplicates the IO device limit entry 'l' (path and all per-type limits) and prepends the copy
 * to c->io_device_limits. Returns 0 on success, -ENOMEM if an allocation fails. */
int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit *l) {
        _cleanup_free_ CGroupIODeviceLimit *n = NULL;

        assert(c);
        assert(l);

        n = new0(CGroupIODeviceLimit, 1);
        if (!n) /* check the fresh allocation, not the (already asserted) input */
                return -ENOMEM;

        n->path = strdup(l->path);
        if (!n->path)
                return -ENOMEM;

        for (CGroupIOLimitType type = 0; type < _CGROUP_IO_LIMIT_TYPE_MAX; type++)
                n->limits[type] = l->limits[type];

        LIST_PREPEND(device_limits, c->io_device_limits, TAKE_PTR(n));
        return 0;
}
/* Duplicates the IO device latency entry 'l' (path and target latency) and prepends the copy to
 * c->io_device_latencies. Returns 0 on success, -ENOMEM if an allocation fails. */
int cgroup_context_add_io_device_latency_dup(CGroupContext *c, CGroupIODeviceLatency *l) {
        _cleanup_free_ CGroupIODeviceLatency *copy = NULL;
        _cleanup_free_ char *path = NULL;

        assert(c);
        assert(l);

        path = strdup(l->path);
        if (!path)
                return -ENOMEM;

        copy = new0(CGroupIODeviceLatency, 1);
        if (!copy)
                return -ENOMEM;

        /* The entry takes over ownership of the duplicated path. */
        copy->path = TAKE_PTR(path);
        copy->target_usec = l->target_usec;

        LIST_PREPEND(device_latencies, c->io_device_latencies, TAKE_PTR(copy));
        return 0;
}
/* Duplicates the legacy BlockIO device weight entry 'w' (path and weight) and prepends the copy
 * to c->blockio_device_weights. Returns 0 on success, -ENOMEM if an allocation fails. */
int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, CGroupBlockIODeviceWeight *w) {
        _cleanup_free_ CGroupBlockIODeviceWeight *copy = NULL;
        _cleanup_free_ char *path = NULL;

        assert(c);
        assert(w);

        path = strdup(w->path);
        if (!path)
                return -ENOMEM;

        copy = new0(CGroupBlockIODeviceWeight, 1);
        if (!copy)
                return -ENOMEM;

        /* The entry takes over ownership of the duplicated path. */
        copy->path = TAKE_PTR(path);
        copy->weight = w->weight;

        LIST_PREPEND(device_weights, c->blockio_device_weights, TAKE_PTR(copy));
        return 0;
}
/* Duplicates the legacy BlockIO device bandwidth entry 'b' (path, read and write limits) and
 * prepends the copy to c->blockio_device_bandwidths. Returns 0 on success, -ENOMEM if an
 * allocation fails. */
int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlockIODeviceBandwidth *b) {
        _cleanup_free_ CGroupBlockIODeviceBandwidth *n = NULL;

        assert(c);
        assert(b);

        n = new0(CGroupBlockIODeviceBandwidth, 1);
        if (!n)
                return -ENOMEM;

        /* Like the sibling *_dup() helpers, copy the device path too: a bandwidth limit without
         * the device it applies to is meaningless. */
        n->path = strdup(b->path);
        if (!n->path)
                return -ENOMEM;

        n->rbps = b->rbps;
        n->wbps = b->wbps;

        LIST_PREPEND(device_bandwidths, c->blockio_device_bandwidths, TAKE_PTR(n));
        return 0;
}
/* Duplicates the device allow-list entry 'a' (path and permissions) and prepends the copy to
 * c->device_allow. Returns 0 on success, -ENOMEM if an allocation fails. */
int cgroup_context_add_device_allow_dup(CGroupContext *c, CGroupDeviceAllow *a) {
        _cleanup_free_ CGroupDeviceAllow *copy = NULL;
        _cleanup_free_ char *path = NULL;

        assert(c);
        assert(a);

        path = strdup(a->path);
        if (!path)
                return -ENOMEM;

        copy = new0(CGroupDeviceAllow, 1);
        if (!copy)
                return -ENOMEM;

        /* The entry takes over ownership of the duplicated path. */
        copy->path = TAKE_PTR(path);
        copy->permissions = a->permissions;

        LIST_PREPEND(device_allow, c->device_allow, TAKE_PTR(copy));
        return 0;
}
/* Duplicates socket bind item 'i' and prepends the copy to the list whose head pointer is stored
 * at '*h'. The head is taken by address: the prepend has to update the list head that lives in
 * the CGroupContext, otherwise the new item would only be linked to a local copy of the head and
 * be leaked. Returns 0 on success, -ENOMEM if the allocation fails. */
static int cgroup_context_add_socket_bind_item_dup(CGroupContext *c, CGroupSocketBindItem *i, CGroupSocketBindItem **h) {
        _cleanup_free_ CGroupSocketBindItem *n = NULL;
        CGroupSocketBindItem *head;

        assert(c);
        assert(i);
        assert(h);

        n = new0(CGroupSocketBindItem, 1);
        if (!n)
                return -ENOMEM;

        *n = (CGroupSocketBindItem) {
                .address_family = i->address_family,
                .ip_protocol = i->ip_protocol,
                .nr_ports = i->nr_ports,
                .port_min = i->port_min,
        };

        /* Prepend via a plain local lvalue, then publish the new head to the caller's list. */
        head = *h;
        LIST_PREPEND(socket_bind_items, head, TAKE_PTR(n));
        *h = head;

        return 0;
}

/* Duplicates 'i' into the SocketBindAllow= list of 'c'. Returns 0 or a negative errno. */
int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, CGroupSocketBindItem *i) {
        assert(c);

        return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_allow);
}

/* Duplicates 'i' into the SocketBindDeny= list of 'c'. Returns 0 or a negative errno. */
int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, CGroupSocketBindItem *i) {
        assert(c);

        return cgroup_context_add_socket_bind_item_dup(c, i, &c->socket_bind_deny);
}
/* Copies the cgroup settings of 'src' into 'dst'. Scalar settings are assigned directly; the
 * device/limit/socket-bind lists, IP address sets, IP filter string lists, foreign BPF programs
 * and restricted network interfaces are duplicated entry by entry.
 *
 * Lists are walked from tail to head because the *_dup() helpers prepend, which keeps the
 * original order in the copy.
 *
 * Returns 0 on success or a negative errno on failure. On failure 'dst' may be left partially
 * populated. */
int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src) {
        struct in_addr_prefix *i;
        char *iface;
        int r;

        assert(src);
        assert(dst);

        dst->cpu_accounting = src->cpu_accounting;
        dst->io_accounting = src->io_accounting;
        dst->blockio_accounting = src->blockio_accounting;
        dst->memory_accounting = src->memory_accounting;
        dst->tasks_accounting = src->tasks_accounting;
        dst->ip_accounting = src->ip_accounting;

        /* Take the value from src (this used to be a no-op self-assignment of dst) */
        dst->memory_oom_group = src->memory_oom_group;

        dst->cpu_weight = src->cpu_weight;
        dst->startup_cpu_weight = src->startup_cpu_weight;
        dst->cpu_quota_per_sec_usec = src->cpu_quota_per_sec_usec;
        dst->cpu_quota_period_usec = src->cpu_quota_period_usec;

        /* NOTE(review): the four cpuset members are copied by plain struct assignment. If the
         * cpuset type owns heap memory, dst and src would share it afterwards — confirm whether
         * a deep copy is needed here. */
        dst->cpuset_cpus = src->cpuset_cpus;
        dst->startup_cpuset_cpus = src->startup_cpuset_cpus;
        dst->cpuset_mems = src->cpuset_mems;
        dst->startup_cpuset_mems = src->startup_cpuset_mems;

        dst->io_weight = src->io_weight;
        dst->startup_io_weight = src->startup_io_weight;

        LIST_FOREACH_BACKWARDS(device_weights, w, LIST_FIND_TAIL(device_weights, src->io_device_weights)) {
                r = cgroup_context_add_io_device_weight_dup(dst, w);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_limits, l, LIST_FIND_TAIL(device_limits, src->io_device_limits)) {
                r = cgroup_context_add_io_device_limit_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_latencies, l, LIST_FIND_TAIL(device_latencies, src->io_device_latencies)) {
                r = cgroup_context_add_io_device_latency_dup(dst, l);
                if (r < 0)
                        return r;
        }

        dst->default_memory_min = src->default_memory_min;
        dst->default_memory_low = src->default_memory_low;
        dst->default_startup_memory_low = src->default_startup_memory_low;
        dst->memory_min = src->memory_min;
        dst->memory_low = src->memory_low;
        dst->startup_memory_low = src->startup_memory_low;
        dst->memory_high = src->memory_high;
        dst->startup_memory_high = src->startup_memory_high;
        dst->memory_max = src->memory_max;
        dst->startup_memory_max = src->startup_memory_max;
        dst->memory_swap_max = src->memory_swap_max;
        dst->startup_memory_swap_max = src->startup_memory_swap_max;
        dst->memory_zswap_max = src->memory_zswap_max;
        dst->startup_memory_zswap_max = src->startup_memory_zswap_max;

        dst->default_memory_min_set = src->default_memory_min_set;
        dst->default_memory_low_set = src->default_memory_low_set;
        dst->default_startup_memory_low_set = src->default_startup_memory_low_set;
        dst->memory_min_set = src->memory_min_set;
        dst->memory_low_set = src->memory_low_set;
        dst->startup_memory_low_set = src->startup_memory_low_set;
        dst->startup_memory_high_set = src->startup_memory_high_set;
        dst->startup_memory_max_set = src->startup_memory_max_set;
        dst->startup_memory_swap_max_set = src->startup_memory_swap_max_set;
        dst->startup_memory_zswap_max_set = src->startup_memory_zswap_max_set;

        SET_FOREACH(i, src->ip_address_allow) {
                r = in_addr_prefix_add(&dst->ip_address_allow, i);
                if (r < 0)
                        return r;
        }

        SET_FOREACH(i, src->ip_address_deny) {
                r = in_addr_prefix_add(&dst->ip_address_deny, i);
                if (r < 0)
                        return r;
        }

        dst->ip_address_allow_reduced = src->ip_address_allow_reduced;
        dst->ip_address_deny_reduced = src->ip_address_deny_reduced;

        if (!strv_isempty(src->ip_filters_ingress)) {
                dst->ip_filters_ingress = strv_copy(src->ip_filters_ingress);
                if (!dst->ip_filters_ingress)
                        return -ENOMEM;
        }

        if (!strv_isempty(src->ip_filters_egress)) {
                dst->ip_filters_egress = strv_copy(src->ip_filters_egress);
                if (!dst->ip_filters_egress)
                        return -ENOMEM;
        }

        LIST_FOREACH_BACKWARDS(programs, l, LIST_FIND_TAIL(programs, src->bpf_foreign_programs)) {
                r = cgroup_context_add_bpf_foreign_program_dup(dst, l);
                if (r < 0)
                        return r;
        }

        SET_FOREACH(iface, src->restrict_network_interfaces) {
                r = set_put_strdup(&dst->restrict_network_interfaces, iface);
                if (r < 0)
                        return r;
        }
        dst->restrict_network_interfaces_is_allow_list = src->restrict_network_interfaces_is_allow_list;

        dst->cpu_shares = src->cpu_shares;
        dst->startup_cpu_shares = src->startup_cpu_shares;

        dst->blockio_weight = src->blockio_weight;
        dst->startup_blockio_weight = src->startup_blockio_weight;

        LIST_FOREACH_BACKWARDS(device_weights, l, LIST_FIND_TAIL(device_weights, src->blockio_device_weights)) {
                r = cgroup_context_add_block_io_device_weight_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(device_bandwidths, l, LIST_FIND_TAIL(device_bandwidths, src->blockio_device_bandwidths)) {
                r = cgroup_context_add_block_io_device_bandwidth_dup(dst, l);
                if (r < 0)
                        return r;
        }

        dst->memory_limit = src->memory_limit;

        dst->device_policy = src->device_policy;
        LIST_FOREACH_BACKWARDS(device_allow, l, LIST_FIND_TAIL(device_allow, src->device_allow)) {
                r = cgroup_context_add_device_allow_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_allow)) {
                r = cgroup_context_add_socket_bind_item_allow_dup(dst, l);
                if (r < 0)
                        return r;
        }

        LIST_FOREACH_BACKWARDS(socket_bind_items, l, LIST_FIND_TAIL(socket_bind_items, src->socket_bind_deny)) {
                r = cgroup_context_add_socket_bind_item_deny_dup(dst, l);
                if (r < 0)
                        return r;
        }

        dst->tasks_max = src->tasks_max;

        return 0;
}
void cgroup_context_free_device_allow(CGroupContext *c, CGroupDeviceAllow *a) {
assert(c);
assert(a);

View File

@ -129,6 +129,7 @@ typedef enum CGroupPressureWatch {
_CGROUP_PRESSURE_WATCH_INVALID = -EINVAL,
} CGroupPressureWatch;
/* When adding members make sure to update cgroup_context_copy() accordingly */
struct CGroupContext {
bool cpu_accounting;
bool io_accounting;
@ -285,6 +286,7 @@ uint64_t cgroup_context_cpu_weight(CGroupContext *c, ManagerState state);
usec_t cgroup_cpu_adjust_period(usec_t period, usec_t quota, usec_t resolution, usec_t max_period);
void cgroup_context_init(CGroupContext *c);
int cgroup_context_copy(CGroupContext *dst, const CGroupContext *src);
void cgroup_context_done(CGroupContext *c);
void cgroup_context_dump(Unit *u, FILE* f, const char *prefix);
void cgroup_context_dump_socket_bind_item(const CGroupSocketBindItem *item, FILE *f);
@ -309,6 +311,18 @@ static inline bool cgroup_context_want_memory_pressure(const CGroupContext *c) {
int cgroup_context_add_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_or_update_device_allow(CGroupContext *c, const char *dev, CGroupDevicePermissions p);
int cgroup_context_add_bpf_foreign_program(CGroupContext *c, uint32_t attach_type, const char *path);
/* The *_dup() helpers below each take one existing list entry and add a duplicate of it to the
 * corresponding list of context 'c'. They return an int status code. */
int cgroup_context_add_io_device_limit_dup(CGroupContext *c, CGroupIODeviceLimit *l);
int cgroup_context_add_io_device_weight_dup(CGroupContext *c, CGroupIODeviceWeight *w);
int cgroup_context_add_io_device_latency_dup(CGroupContext *c, CGroupIODeviceLatency *l);
int cgroup_context_add_block_io_device_weight_dup(CGroupContext *c, CGroupBlockIODeviceWeight *w);
int cgroup_context_add_block_io_device_bandwidth_dup(CGroupContext *c, CGroupBlockIODeviceBandwidth *b);
int cgroup_context_add_device_allow_dup(CGroupContext *c, CGroupDeviceAllow *a);
int cgroup_context_add_socket_bind_item_allow_dup(CGroupContext *c, CGroupSocketBindItem *i);
int cgroup_context_add_socket_bind_item_deny_dup(CGroupContext *c, CGroupSocketBindItem *i);
/* Adds a duplicate of foreign BPF program 'p' to 'c' by registering its attach type and bpffs
 * path through cgroup_context_add_bpf_foreign_program(), whose result is returned as is. */
static inline int cgroup_context_add_bpf_foreign_program_dup(CGroupContext *c, CGroupBPFForeignProgram *p) {
        uint32_t attach_type = p->attach_type;
        const char *path = p->bpffs_path;

        return cgroup_context_add_bpf_foreign_program(c, attach_type, path);
}
void unit_modify_nft_set(Unit *u, bool add);

View File

@ -2933,6 +2933,180 @@ static int method_dump_unit_descriptor_store(sd_bus_message *message, void *user
return method_generic_unit_operation(message, userdata, error, bus_service_method_dump_file_descriptor_store, 0);
}
static int aux_scope_from_message(Manager *m, sd_bus_message *message, Unit **ret_scope, sd_bus_error *error) {
_cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
_cleanup_free_ PidRef *pidrefs = NULL;
const char *name;
Unit *from, *scope;
PidRef *main_pid;
CGroupContext *cc;
size_t n_pids = 0;
uint64_t flags;
pid_t pid;
int r;
assert(ret_scope);
r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_PID, &creds);
if (r < 0)
return r;
r = sd_bus_creds_get_pid(creds, &pid);
if (r < 0)
return r;
from = manager_get_unit_by_pid(m, pid);
if (!from)
return sd_bus_error_set(error, BUS_ERROR_NO_SUCH_UNIT, "Client not member of any unit.");
if (!IN_SET(from->type, UNIT_SERVICE, UNIT_SCOPE))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Starting auxiliary scope is supported only for service and scope units, refusing.");
if (!unit_name_is_valid(from->id, UNIT_NAME_PLAIN))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Auxiliary scope can be started only for non-template service units and scope units, refusing.");
r = sd_bus_message_read(message, "s", &name);
if (r < 0)
return r;
if (!unit_name_is_valid(name, UNIT_NAME_PLAIN))
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Invalid name \"%s\" for auxiliary scope.", name);
if (unit_name_to_type(name) != UNIT_SCOPE)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS,
"Name \"%s\" of auxiliary scope doesn't have .scope suffix.", name);
main_pid = unit_main_pid(from);
r = sd_bus_message_enter_container(message, 'a', "h");
if (r < 0)
return r;
for (;;) {
_cleanup_(pidref_done) PidRef p = PIDREF_NULL;
Unit *unit;
int fd;
r = sd_bus_message_read(message, "h", &fd);
if (r < 0)
return r;
if (r == 0)
break;
r = pidref_set_pidfd(&p, fd);
if (r < 0) {
log_unit_warning_errno(from, r, "Failed to create process reference from PIDFD, ignoring: %m");
continue;
}
unit = manager_get_unit_by_pidref(m, &p);
if (!unit) {
log_unit_warning_errno(from, SYNTHETIC_ERRNO(ENOENT), "Failed to get unit from PIDFD, ingoring: %m");
continue;
}
if (!streq(unit->id, from->id)) {
log_unit_warning(from, "PID " PID_FMT " is not running in the same service as the calling process, ignoring.", p.pid);
continue;
}
if (pidref_equal(main_pid, &p)) {
log_unit_warning(from, "Main PID cannot be migrated into auxiliary scope, ignoring.");
continue;
}
if (!GREEDY_REALLOC(pidrefs, n_pids+1))
return -ENOMEM;
pidrefs[n_pids++] = TAKE_PIDREF(p);
}
if (n_pids == 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "No processes can be migrated to auxiliary scope.");
r = sd_bus_message_exit_container(message);
if (r < 0)
return r;
r = sd_bus_message_read(message, "t", &flags);
if (r < 0)
return r;
if (flags != 0)
return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Flags must be zero.");
r = manager_load_unit(m, name, NULL, error, &scope);
if (r < 0)
return r;
if (!unit_is_pristine(scope))
return sd_bus_error_setf(error, BUS_ERROR_UNIT_EXISTS,
"Unit %s was already loaded or has a fragment file.", name);
r = unit_set_slice(scope, UNIT_GET_SLICE(from));
if (r < 0)
return r;
cc = unit_get_cgroup_context(scope);
r = cgroup_context_copy(cc, unit_get_cgroup_context(from));
if (r < 0)
return r;
r = unit_make_transient(scope);
if (r < 0)
return r;
r = bus_unit_set_properties(scope, message, UNIT_RUNTIME, true, error);
if (r < 0)
return r;
FOREACH_ARRAY(p, pidrefs, n_pids) {
r = unit_pid_attachable(scope, p, error);
if (r < 0)
return r;
r = unit_watch_pidref(scope, p, /* exclusive= */ false);
if (r < 0 && r != -EEXIST)
return r;
}
/* Now load the missing bits of the unit we just created */
unit_add_to_load_queue(scope);
manager_dispatch_load_queue(m);
*ret_scope = TAKE_PTR(scope);
return 1;
}
/* D-Bus handler for StartAuxiliaryScope(): after the SELinux and polkit checks pass, builds the
 * auxiliary scope from the message and enqueues a start job for it. */
static int method_start_aux_scope(sd_bus_message *message, void *userdata, sd_bus_error *error) {
        Manager *manager = ASSERT_PTR(userdata);
        Unit *scope = NULL; /* avoid false maybe-uninitialized warning */
        int r;

        assert(message);

        r = mac_selinux_access_check(message, "start", error);
        if (r < 0)
                return r;

        r = bus_verify_manage_units_async(manager, message, error);
        if (r <= 0)
                /* On 0 there is no authorization for now, but the async polkit stuff will call
                 * us again when it has it; report the message as handled. */
                return r < 0 ? r : 1;

        r = aux_scope_from_message(manager, message, &scope, error);
        if (r < 0)
                return r;

        return bus_unit_queue_job(message, scope, JOB_START, JOB_REPLACE, 0, error);
}
const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_VTABLE_START(0),
@ -3491,6 +3665,11 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_RESULT("a(suuutuusu)", entries),
method_dump_unit_descriptor_store,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("StartAuxiliaryScope",
SD_BUS_ARGS("s", name, "ah", pidfds, "t", flags, "a(sv)", properties),
SD_BUS_RESULT("o", job),
method_start_aux_scope,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_SIGNAL_WITH_ARGS("UnitNew",
SD_BUS_ARGS("s", id, "o", unit),