1
0
mirror of https://github.com/systemd/systemd.git synced 2025-01-31 05:47:30 +03:00

pid1: add D-Bus API for removing delegated subcgroups

When running unprivileged containers, we run into a scenario where an
unpriv owned cgroup has a subcgroup delegated to another user (i.e. the
container's own UIDs). When the owner of that cgroup dies without
cleaning it up then the unpriv service manager might encounter a cgroup
it cannot delete anymore.

Let's address that: let's expose a method call on the service manager
(primarily in PID1) that can be used to delete a subcgroup of a unit one
owns. This would then allow the unpriv service manager to ask the priv
service manager to get rid of such a cgroup.

This commit only adds the method call, the next commit then adds the
code that makes use of this.
This commit is contained in:
Lennart Poettering 2024-11-12 11:59:40 +01:00
parent 59857b672c
commit 94634b4b03
7 changed files with 182 additions and 4 deletions

View File

@ -147,6 +147,9 @@ node /org/freedesktop/systemd1 {
AttachProcessesToUnit(in s unit_name,
in s subcgroup,
in au pids);
RemoveSubgroupFromUnit(in s unit_name,
in s subcgroup,
in t flags);
AbandonScope(in s name);
GetJob(in u id,
out o job);
@ -870,6 +873,8 @@ node /org/freedesktop/systemd1 {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcessesToUnit()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroupFromUnit()"/>
<variablelist class="dbus-method" generated="True" extra-ref="AbandonScope()"/>
<variablelist class="dbus-method" generated="True" extra-ref="GetJob()"/>
@ -1599,6 +1604,13 @@ node /org/freedesktop/systemd1 {
parameters. The possible values are <literal>configuration</literal>, <literal>state</literal>,
<literal>logs</literal>, <literal>cache</literal>, <literal>runtime</literal>,
<literal>fdstore</literal>, and <literal>all</literal>.</para>
<para><function>RemoveSubgroupFromUnit()</function> removes a subcgroup belonging to a unit's
cgroup. Takes three arguments: the unit name (if empty defaults to the caller's unit), a cgroup path
(which must start with a slash <literal>/</literal>), which is taken relative to the unit's
cgroup, and a flags argument (which must be zero for now). This is primarily useful for unprivileged
service managers to ask the system service manager for removal of subcgroups they manage, in case one
was delegated to other UIDs.</para>
</refsect2>
<refsect2>
@ -2704,6 +2716,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Type = '...';
@ -3398,6 +3412,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property Type is not documented!-->
<!--property ExitType is not documented!-->
@ -4006,6 +4022,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-property" generated="True" extra-ref="Type"/>
<variablelist class="dbus-property" generated="True" extra-ref="ExitType"/>
@ -4901,6 +4919,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s BindIPv6Only = '...';
@ -5592,6 +5612,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property BindIPv6Only is not documented!-->
<!--property Backlog is not documented!-->
@ -6206,6 +6228,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindIPv6Only"/>
<variablelist class="dbus-property" generated="True" extra-ref="Backlog"/>
@ -7001,6 +7025,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly s Where = '...';
@ -7601,6 +7627,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property Where is not documented!-->
<!--property What is not documented!-->
@ -8141,6 +8169,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-property" generated="True" extra-ref="Where"/>
<variablelist class="dbus-property" generated="True" extra-ref="What"/>
@ -8991,6 +9021,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
properties:
readonly s What = '...';
readonly i Priority = ...;
@ -9577,6 +9609,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property What is not documented!-->
<!--property Priority is not documented!-->
@ -10103,6 +10137,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-property" generated="True" extra-ref="What"/>
<variablelist class="dbus-property" generated="True" extra-ref="Priority"/>
@ -10805,6 +10841,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
properties:
@org.freedesktop.DBus.Property.EmitsChangedSignal("false")
readonly s Slice = '...';
@ -11004,6 +11042,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property Slice is not documented!-->
<!--property ControlGroupId is not documented!-->
@ -11196,6 +11236,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-property" generated="True" extra-ref="Slice"/>
<variablelist class="dbus-property" generated="True" extra-ref="ControlGroup"/>
@ -11411,6 +11453,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
GetProcesses(out a(sus) processes);
AttachProcesses(in s subcgroup,
in au pids);
RemoveSubgroup(in s subcgroup,
in t flags);
signals:
RequestStop();
properties:
@ -11636,6 +11680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<!--method AttachProcesses is not documented!-->
<!--method RemoveSubgroup is not documented!-->
<!--property RuntimeMaxUSec is not documented!-->
<!--property RuntimeRandomizedExtraUSec is not documented!-->
@ -11850,6 +11896,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope {
<variablelist class="dbus-method" generated="True" extra-ref="AttachProcesses()"/>
<variablelist class="dbus-method" generated="True" extra-ref="RemoveSubgroup()"/>
<variablelist class="dbus-signal" generated="True" extra-ref="RequestStop()"/>
<variablelist class="dbus-property" generated="True" extra-ref="Controller"/>
@ -12254,6 +12302,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ShutdownStartTimestamp</varname>,
<varname>ShutdownStartTimestampMonotonic</varname>, and
<varname>SoftRebootsCount</varname> were added in version 256.</para>
<para><function>RemoveSubgroupFromUnit()</function> was added in version 258.</para>
</refsect2>
<refsect2>
<title>Unit Objects</title>
@ -12320,7 +12369,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ProtectControlGroupsEx</varname>,
<varname>PrivateUsersEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Socket Unit Objects</title>
@ -12364,7 +12413,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Mount Unit Objects</title>
@ -12405,7 +12454,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Swap Unit Objects</title>
@ -12446,7 +12495,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>ManagedOOMMemoryPressureDurationUSec</varname>,
<varname>ProtectControlGroupsEx</varname>, and
<varname>PrivatePIDs</varname> were added in version 257.</para>
<para><varname>ProtectHostnameEx</varname> was added in version 258.</para>
<para><varname>ProtectHostnameEx</varname> and <function>RemoveSubgroup()</function> were added in version 258.</para>
</refsect2>
<refsect2>
<title>Slice Unit Objects</title>
@ -12472,6 +12521,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>EffectiveTasksMax</varname>, and
<varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
<para><function>RemoveSubgroup()</function> was added in version 258.</para>
</refsect2>
<refsect2>
<title>Scope Unit Objects</title>
@ -12498,6 +12548,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \
<varname>EffectiveTasksMax</varname>, and
<varname>MemoryZSwapWriteback</varname> were added in version 256.</para>
<para><varname>ManagedOOMMemoryPressureDurationUSec</varname> was added in version 257.</para>
<para><function>RemoveSubgroup()</function> was added in version 258.</para>
</refsect2>
<refsect2>
<title>Job Objects</title>

View File

@ -3126,6 +3126,49 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) {
return ret;
}
/* Removes a subcgroup located below the cgroup of unit 'u'.
 *
 * 'suffix_path' is joined onto the unit's own cgroup path. If it is empty or the root path, the unit's
 * own cgroup is only trimmed and never deleted itself.
 *
 * Returns 0 on success, or a negative errno-style value:
 *   -EINVAL     the unit type has no cgroup context
 *   -ENOMEDIUM  cgroup delegation is not enabled for the unit
 *   -EOWNERDEAD the unit has no cgroup runtime or no cgroup path
 *   -ENOMEM     building the joined path failed
 * Failures of unit_pick_cgroup_path() and cg_trim_everywhere() are passed through unchanged. */
int unit_remove_subcgroup(Unit *u, const char *suffix_path) {
        _cleanup_free_ char *joined = NULL;
        const char *target;
        CGroupRuntime *crt;
        bool remove_target;
        int r;

        assert(u);

        if (!UNIT_HAS_CGROUP_CONTEXT(u))
                return -EINVAL;

        if (!unit_cgroup_delegate(u))
                return -ENOMEDIUM;

        r = unit_pick_cgroup_path(u);
        if (r < 0)
                return r;

        crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        /* Only a real subcgroup may be removed together with its top-level directory. */
        remove_target = !empty_or_root(suffix_path);
        if (remove_target) {
                joined = path_join(crt->cgroup_path, suffix_path);
                if (!joined)
                        return -ENOMEM;

                target = joined;
        } else
                /* The unit's main cgroup: clean out what is below it, but keep the cgroup itself. */
                target = empty_to_root(crt->cgroup_path);

        log_unit_debug(u, "Removing subcgroup '%s'...", target);

        r = cg_trim_everywhere(u->manager->cgroup_supported, target, remove_target);
        if (r < 0)
                return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", remove_target ? "remove" : "trim", target);

        return 0;
}
static bool unit_has_mask_realized(
Unit *u,
CGroupMask target_mask,

View File

@ -456,6 +456,7 @@ int unit_check_oomd_kill(Unit *u);
int unit_check_oom(Unit *u);
int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path);
int unit_remove_subcgroup(Unit *u, const char *suffix_path);
int manager_setup_cgroup(Manager *m);
void manager_shutdown_cgroup(Manager *m, bool delete);

View File

@ -960,6 +960,12 @@ static int method_attach_processes_to_unit(sd_bus_message *message, void *userda
return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED);
}
static int method_remove_subgroup_from_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) {
        /* Manager-level RemoveSubgroupFromUnit(): hands the request to bus_unit_method_remove_subgroup()
         * via the generic unit operation helper. Subgroups are never removed from units that aren't
         * loaded, but we do permit loading the unit for this call: removal is clean-up work, and that is
         * fine to carry out once the unit has been stopped already. */
        return method_generic_unit_operation(
                        message,
                        userdata,
                        error,
                        bus_unit_method_remove_subgroup,
                        GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED);
}
static int transient_unit_from_message(
Manager *m,
sd_bus_message *message,
@ -3246,6 +3252,11 @@ const sd_bus_vtable bus_manager_vtable[] = {
SD_BUS_NO_RESULT,
method_attach_processes_to_unit,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("RemoveSubgroupFromUnit",
SD_BUS_ARGS("s", unit_name, "s", subcgroup, "t", flags),
SD_BUS_NO_RESULT,
method_remove_subgroup_from_unit,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("AbandonScope",
SD_BUS_ARGS("s", name),
SD_BUS_NO_RESULT,

View File

@ -1594,6 +1594,59 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd
return sd_bus_reply_method_return(message, NULL);
}
/* D-Bus handler for the RemoveSubgroup() unit method (signature "st": subcgroup path, flags).
 *
 * 'userdata' is the Unit the call is directed at. On success an empty method reply is sent. On failure
 * 'error' is filled in and a negative value is returned, so that every failure path reports through the
 * same mechanism. */
int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error) {
        Unit *u = ASSERT_PTR(userdata);
        int r;

        assert(message);

        /* This removes a subcgroup of the unit, regardless of which user owns the subcgroup. This is
         * useful when cgroup delegation is enabled for a unit, and the unit subdelegates the cgroup
         * further. */

        r = mac_selinux_unit_access_check(u, message, "stop", error);
        if (r < 0)
                return r;

        const char *path;
        uint64_t flags;
        r = sd_bus_message_read(message, "st", &path, &flags);
        if (r < 0)
                return r;

        /* No flags defined for now. */
        if (flags != 0)
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Invalid 'flags' parameter '%" PRIu64 "'", flags);

        if (!unit_cgroup_delegate(u))
                return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Subcgroup removal not available on non-delegated units.");

        /* The path is taken relative to the unit's cgroup, hence insist on a clean, absolute path. */
        if (!path_is_absolute(path))
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path);

        if (!path_is_normalized(path))
                return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path);

        _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL;
        r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds);
        if (r < 0)
                return r;

        uid_t sender_uid;
        r = sd_bus_creds_get_euid(creds, &sender_uid);
        if (r < 0)
                return r;

        /* Allow this only if the client is privileged, is us, or is the user of the unit itself. */
        if (sender_uid != 0 && sender_uid != getuid() && sender_uid != u->ref_uid)
                return sd_bus_error_set(error, SD_BUS_ERROR_ACCESS_DENIED, "Client is not permitted to alter cgroup.");

        r = unit_remove_subcgroup(u, path);
        if (r < 0)
                return sd_bus_error_set_errnof(error, r, "Failed to remove subgroup %s: %m", path);

        return sd_bus_reply_method_return(message, NULL);
}
const sd_bus_vtable bus_unit_cgroup_vtable[] = {
SD_BUS_VTABLE_START(0),
SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0),
@ -1633,6 +1686,12 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = {
bus_unit_method_attach_processes,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_METHOD_WITH_ARGS("RemoveSubgroup",
SD_BUS_ARGS("s", subcgroup, "t", flags),
SD_BUS_NO_RESULT,
bus_unit_method_remove_subgroup,
SD_BUS_VTABLE_UNPRIVILEGED),
SD_BUS_VTABLE_END
};

View File

@ -22,6 +22,7 @@ int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags fla
int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error);
int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error);

View File

@ -274,6 +274,10 @@
send_interface="org.freedesktop.systemd1.Manager"
send_member="AttachProcessesToUnit"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Manager"
send_member="RemoveSubgroupFromUnit"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Manager"
send_member="CancelJob"/>
@ -432,6 +436,10 @@
send_interface="org.freedesktop.systemd1.Service"
send_member="AttachProcesses"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Service"
send_member="RemoveSubgroup"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Service"
send_member="BindMount"/>
@ -446,6 +454,10 @@
send_interface="org.freedesktop.systemd1.Scope"
send_member="AttachProcesses"/>
<allow send_destination="org.freedesktop.systemd1"
send_interface="org.freedesktop.systemd1.Scope"
send_member="RemoveSubgroup"/>
<allow receive_sender="org.freedesktop.systemd1"/>
</policy>