From 94634b4b0312e4cdb88e37cbbb4f2ec9273cb28a Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Tue, 12 Nov 2024 11:59:40 +0100 Subject: [PATCH] pid1: add D-Bus API for removing delegated subcgroups When running unprivileged containers, we run into a scenario where an unpriv owned cgroup has a subcgroup delegated to another user (i.e. the container's own UIDs). When the owner of that cgroup dies without cleaning it up then the unpriv service manager might encounter a cgroup it cannot delete anymore. Let's address that: let's expose a method call on the service manager (primarly in PID1) that can be used to delete a subcgroup of a unit one owns. This would then allow the unpriv service manager to ask the priv service manager to get rid of such a cgroup. This commit only adds the method call, the next commit then adds the code that makes use of this. --- man/org.freedesktop.systemd1.xml | 59 ++++++++++++++++++++++++-- src/core/cgroup.c | 43 +++++++++++++++++++ src/core/cgroup.h | 1 + src/core/dbus-manager.c | 11 +++++ src/core/dbus-unit.c | 59 ++++++++++++++++++++++++++ src/core/dbus-unit.h | 1 + src/core/org.freedesktop.systemd1.conf | 12 ++++++ 7 files changed, 182 insertions(+), 4 deletions(-) diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index 1c5e7f2eb75..b95f68b4a55 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -147,6 +147,9 @@ node /org/freedesktop/systemd1 { AttachProcessesToUnit(in s unit_name, in s subcgroup, in au pids); + RemoveSubgroupFromUnit(in s unit_name, + in s subcgroup, + in t flags); AbandonScope(in s name); GetJob(in u id, out o job); @@ -870,6 +873,8 @@ node /org/freedesktop/systemd1 { + + @@ -1599,6 +1604,13 @@ node /org/freedesktop/systemd1 { parameters. The possible values are configuration, state, logs, cache, runtime, fdstore, and all. + + RemoveSubgroupFromUnit() removes a subcgroup belonging to a unit's + cgroup. Takes three arguments: the unit name (if empty defaults to the caller's unit), a cgroup path + (which must start start with a slash /), which is taken relative to the unit's + cgroup, and a flags argument (which must be zero for now). This is primarily useful for unprivileged + service managers to ask the system service manager for removal of subcgroups it manages, in case one + was delegated to other UIDs. @@ -2704,6 +2716,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); properties: @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s Type = '...'; @@ -3398,6 +3412,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4006,6 +4022,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + @@ -4901,6 +4919,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); properties: @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s BindIPv6Only = '...'; @@ -5592,6 +5612,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -6206,6 +6228,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + @@ -7001,6 +7025,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); properties: @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s Where = '...'; @@ -7601,6 +7627,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8141,6 +8169,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + @@ -8991,6 +9021,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); properties: readonly s What = '...'; readonly i Priority = ...; @@ -9577,6 +9609,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10103,6 +10137,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + @@ -10805,6 +10841,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); properties: @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly s Slice = '...'; @@ -11004,6 +11042,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -11196,6 +11236,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + @@ -11411,6 +11453,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { GetProcesses(out a(sus) processes); AttachProcesses(in s subcgroup, in au pids); + RemoveSubgroup(in s subcgroup, + in t flags); signals: RequestStop(); properties: @@ -11636,6 +11680,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -11850,6 +11896,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + @@ -12254,6 +12302,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ShutdownStartTimestamp, ShutdownStartTimestampMonotonic, and SoftRebootsCount were added in version 256. + RemoveSubgroupFromUnit() was added in version 258. Unit Objects @@ -12320,7 +12369,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ProtectControlGroupsEx, PrivateUsersEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx was added in version 258. + ProtectHostnameEx and RemoveSubGroup() were added in version 258. Socket Unit Objects @@ -12364,7 +12413,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMMemoryPressureDurationUSec, ProtectControlGroupsEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx was added in version 258. + ProtectHostnameEx and RemoveSubgroup() were added in version 258. Mount Unit Objects @@ -12405,7 +12454,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMMemoryPressureDurationUSec, ProtectControlGroupsEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx was added in version 258. + ProtectHostnameEx and RemoveSubgroup() was added in version 258. Swap Unit Objects @@ -12446,7 +12495,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ ManagedOOMMemoryPressureDurationUSec, ProtectControlGroupsEx, and PrivatePIDs were added in version 257. - ProtectHostnameEx was added in version 258. + ProtectHostnameEx and RemoveSubgroup() were added in version 258. Slice Unit Objects @@ -12472,6 +12521,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ EffectiveTasksMax, and MemoryZSwapWriteback were added in version 256. ManagedOOMMemoryPressureDurationUSec was added in version 257. + RemoveSubgroup() was added in version 258. Scope Unit Objects @@ -12498,6 +12548,7 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ EffectiveTasksMax, and MemoryZSwapWriteback were added in version 256. ManagedOOMMemoryPressureDurationUSec was added in version 257. + RemoveSubgroup() was added in version 258. Job Objects diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 6933aae54de..345dc5cfbbd 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -3126,6 +3126,49 @@ int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path) { return ret; } +int unit_remove_subcgroup(Unit *u, const char *suffix_path) { + int r; + + assert(u); + + if (!UNIT_HAS_CGROUP_CONTEXT(u)) + return -EINVAL; + + if (!unit_cgroup_delegate(u)) + return -ENOMEDIUM; + + r = unit_pick_cgroup_path(u); + if (r < 0) + return r; + + CGroupRuntime *crt = unit_get_cgroup_runtime(u); + if (!crt || !crt->cgroup_path) + return -EOWNERDEAD; + + _cleanup_free_ char *j = NULL; + bool delete_root; + const char *d; + if (empty_or_root(suffix_path)) { + d = empty_to_root(crt->cgroup_path); + delete_root = false; /* Don't attempt to delete the main cgroup of this unit */ + } else { + j = path_join(crt->cgroup_path, suffix_path); + if (!j) + return -ENOMEM; + + d = j; + delete_root = true; + } + + log_unit_debug(u, "Removing subcgroup '%s'...", d); + + r = cg_trim_everywhere(u->manager->cgroup_supported, d, delete_root); + if (r < 0) + return log_unit_debug_errno(u, r, "Failed to fully %s cgroup '%s': %m", delete_root ? "remove" : "trim", d); + + return 0; +} + static bool unit_has_mask_realized( Unit *u, CGroupMask target_mask, diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 1ed74831c8a..807e56c6210 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -456,6 +456,7 @@ int unit_check_oomd_kill(Unit *u); int unit_check_oom(Unit *u); int unit_attach_pids_to_cgroup(Unit *u, Set *pids, const char *suffix_path); +int unit_remove_subcgroup(Unit *u, const char *suffix_path); int manager_setup_cgroup(Manager *m); void manager_shutdown_cgroup(Manager *m, bool delete); diff --git a/src/core/dbus-manager.c b/src/core/dbus-manager.c index ed1f1241826..99e3ea12ac5 100644 --- a/src/core/dbus-manager.c +++ b/src/core/dbus-manager.c @@ -960,6 +960,12 @@ static int method_attach_processes_to_unit(sd_bus_message *message, void *userda return method_generic_unit_operation(message, userdata, error, bus_unit_method_attach_processes, GENERIC_UNIT_VALIDATE_LOADED); } +static int method_remove_subgroup_from_unit(sd_bus_message *message, void *userdata, sd_bus_error *error) { + /* Don't allow removal of subgroups from units that aren't loaded. But allow loading the unit, since + * this is clean-up work, that is OK to do when the unit is stopped already. */ + return method_generic_unit_operation(message, userdata, error, bus_unit_method_remove_subgroup, GENERIC_UNIT_LOAD|GENERIC_UNIT_VALIDATE_LOADED); +} + static int transient_unit_from_message( Manager *m, sd_bus_message *message, @@ -3246,6 +3252,11 @@ const sd_bus_vtable bus_manager_vtable[] = { SD_BUS_NO_RESULT, method_attach_processes_to_unit, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RemoveSubgroupFromUnit", + SD_BUS_ARGS("s", unit_name, "s", subcgroup, "t", flags), + SD_BUS_NO_RESULT, + method_remove_subgroup_from_unit, + SD_BUS_VTABLE_UNPRIVILEGED), SD_BUS_METHOD_WITH_ARGS("AbandonScope", SD_BUS_ARGS("s", name), SD_BUS_NO_RESULT, diff --git a/src/core/dbus-unit.c b/src/core/dbus-unit.c index 5fa868b8ff8..68cfed9444f 100644 --- a/src/core/dbus-unit.c +++ b/src/core/dbus-unit.c @@ -1594,6 +1594,59 @@ int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd return sd_bus_reply_method_return(message, NULL); } +int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error) { + Unit *u = ASSERT_PTR(userdata); + int r; + + assert(message); + + /* This removes a subcgroup of the unit, regardless which user owns the subcgroup. This is useful + * when cgroup delegation is enabled for a unit, and the unit subdelegates the cgroup further */ + + r = mac_selinux_unit_access_check(u, message, "stop", error); + if (r < 0) + return r; + + const char *path; + uint64_t flags; + r = sd_bus_message_read(message, "st", &path, &flags); + if (r < 0) + return r; + + /* No flags defined for now. */ + if (flags != 0) + return sd_bus_reply_method_errorf(message, SD_BUS_ERROR_INVALID_ARGS, "Invalid 'flags' parameter '%" PRIu64 "'", flags); + + if (!unit_cgroup_delegate(u)) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Subcgroup removal not available on non-delegated units."); + + if (!path_is_absolute(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not absolute: %s", path); + + if (!path_is_normalized(path)) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Control group path is not normalized: %s", path); + + _cleanup_(sd_bus_creds_unrefp) sd_bus_creds *creds = NULL; + r = sd_bus_query_sender_creds(message, SD_BUS_CREDS_EUID, &creds); + if (r < 0) + return r; + + uid_t sender_uid; + r = sd_bus_creds_get_euid(creds, &sender_uid); + if (r < 0) + return r; + + /* Allow this only if the client is privileged, is us, or is the user of the unit itself. */ + if (sender_uid != 0 && sender_uid != getuid() && sender_uid != u->ref_uid) + return sd_bus_error_setf(error, SD_BUS_ERROR_ACCESS_DENIED, "Client is not permitted to alter cgroup."); + + r = unit_remove_subcgroup(u, path); + if (r < 0) + return sd_bus_error_set_errnof(error, r, "Failed to remove subgroup %s: %m", path); + + return sd_bus_reply_method_return(message, NULL); +} + const sd_bus_vtable bus_unit_cgroup_vtable[] = { SD_BUS_VTABLE_START(0), SD_BUS_PROPERTY("Slice", "s", property_get_slice, 0, 0), @@ -1633,6 +1686,12 @@ const sd_bus_vtable bus_unit_cgroup_vtable[] = { bus_unit_method_attach_processes, SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_METHOD_WITH_ARGS("RemoveSubgroup", + SD_BUS_ARGS("s", subcgroup, "t", flags), + SD_BUS_NO_RESULT, + bus_unit_method_remove_subgroup, + SD_BUS_VTABLE_UNPRIVILEGED), + SD_BUS_VTABLE_END }; diff --git a/src/core/dbus-unit.h b/src/core/dbus-unit.h index 6b7828e4bae..e9dd1ec3170 100644 --- a/src/core/dbus-unit.h +++ b/src/core/dbus-unit.h @@ -22,6 +22,7 @@ int bus_unit_set_properties(Unit *u, sd_bus_message *message, UnitWriteFlags fla int bus_unit_method_set_properties(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_get_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_attach_processes(sd_bus_message *message, void *userdata, sd_bus_error *error); +int bus_unit_method_remove_subgroup(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_ref(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_unref(sd_bus_message *message, void *userdata, sd_bus_error *error); int bus_unit_method_clean(sd_bus_message *message, void *userdata, sd_bus_error *error); diff --git a/src/core/org.freedesktop.systemd1.conf b/src/core/org.freedesktop.systemd1.conf index 52034e07e73..2b978a1e770 100644 --- a/src/core/org.freedesktop.systemd1.conf +++ b/src/core/org.freedesktop.systemd1.conf @@ -274,6 +274,10 @@ send_interface="org.freedesktop.systemd1.Manager" send_member="AttachProcessesToUnit"/> + + @@ -432,6 +436,10 @@ send_interface="org.freedesktop.systemd1.Service" send_member="AttachProcesses"/> + + @@ -446,6 +454,10 @@ send_interface="org.freedesktop.systemd1.Scope" send_member="AttachProcesses"/> + +