core: use LSM BPF functions to implement RestrictFileSystems=

It attaches the LSM BPF program when the system manager starts up. It populates the BPF hash-of-maps map when services that have RestrictFileSystems= set are started, and it cleans up the hash of maps when the unit's cgroup is pruned. To pass the file descriptor of the BPF map, we add it to the keep_fds array.
2024-11-01 00:51:24 +03:00 · 2021-10-05 13:18:49 +02:00 · 2021-10-05 13:18:49 +02:00 · b1994387d3
commit b1994387d3
parent 184b4f78cf
7 changed files with 99 additions and 1 deletions
--- a/src/basic/cgroup-util.h
+++ b/src/basic/cgroup-util.h
@ -33,6 +33,9 @@ typedef enum CGroupController {
        CGROUP_CONTROLLER_BPF_FOREIGN,
        CGROUP_CONTROLLER_BPF_SOCKET_BIND,
        CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES,
+        /* The BPF hook implementing RestrictFileSystems= is not defined here.
+         * It's applied as late as possible in exec_child() so we don't block
+         * our own unit setup code. */

        _CGROUP_CONTROLLER_MAX,
        _CGROUP_CONTROLLER_INVALID = -EINVAL,
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@ -37,6 +37,12 @@
 #include "string-util.h"
 #include "virt.h"

+#if BPF_FRAMEWORK
+#include "bpf-dlopen.h"
+#include "bpf-link.h"
+#include "bpf/restrict_fs/restrict-fs-skel.h"
+#endif
+
 #define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)

 /* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
@ -2736,6 +2742,10 @@ void unit_prune_cgroup(Unit *u) {

        (void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */

+#if BPF_FRAMEWORK
+        (void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
+#endif
+
        is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);

        r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);
--- a/src/core/cgroup.h
+++ b/src/core/cgroup.h
@ -3,6 +3,7 @@

 #include <stdbool.h>

+#include "bpf-lsm.h"
 #include "cgroup-util.h"
 #include "cpu-set-util.h"
 #include "list.h"
--- a/src/core/execute.c
+++ b/src/core/execute.c
@ -41,6 +41,7 @@
 #endif
 #include "async.h"
 #include "barrier.h"
+#include "bpf-lsm.h"
 #include "cap-list.h"
 #include "capability-util.h"
 #include "cgroup-setup.h"
@ -1685,6 +1686,29 @@ static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
        return seccomp_restrict_namespaces(c->restrict_namespaces);
 }

+#if HAVE_LIBBPF
+/* Returns false if LSM BPF is supported; otherwise logs at debug level that 'msg' is skipped and returns true. */
+static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
+        const bool unsupported = !lsm_bpf_supported();
+        if (unsupported)
+                log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
+        return unsupported;
+}
+
+/* Hands the unit's filesystem set and allow-list flag to lsm_bpf_unit_restrict_filesystems(); returns its result. */
+static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
+        assert(u);
+        assert(c);
+
+        /* Return 0 without doing anything if the setting is unset or LSM BPF is unsupported (helper logs that). */
+        if (!exec_context_restrict_filesystems_set(c) ||
+            skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
+                return 0;
+
+        return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
+}
+#endif
+
 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
        unsigned long personality;
        int r;
@ -3813,7 +3837,7 @@ static int exec_child(
        /* In case anything used libc syslog(), close this here, too */
        closelog();

-        int keep_fds[n_fds + 2];
+        int keep_fds[n_fds + 3];
        memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
        n_keep_fds = n_fds;

@ -3823,6 +3847,24 @@ static int exec_child(
                return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
        }

+#if HAVE_LIBBPF
+        /* Add the RestrictFileSystems= BPF map fd to keep_fds before close_remaining_fds() runs below. */
+        if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
+                int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
+                if (bpf_map_fd < 0) {
+                        *exit_status = EXIT_FDS;
+                        /* The failure code is carried in bpf_map_fd; 'r' is not set by the call above. */
+                        return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
+                }

+                r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
+                if (r < 0) {
+                        *exit_status = EXIT_FDS;
+                        return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
+                }
+        }
+#endif
+
        r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
        if (r < 0) {
                *exit_status = EXIT_FDS;
@ -4682,6 +4724,15 @@ static int exec_child(
                        return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
                }
 #endif
+
+#if HAVE_LIBBPF
+                r = apply_restrict_filesystems(unit, context);
+                if (r < 0) {
+                        *exit_status = EXIT_BPF;
+                        return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
+                }
+#endif
+
        }

        if (!strv_isempty(context->unset_environment)) {
@ -4967,6 +5018,8 @@ void exec_context_done(ExecContext *c) {
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

+        c->restrict_filesystems = set_free(c->restrict_filesystems);
+
        c->syscall_filter = hashmap_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);
@ -5734,6 +5787,12 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
                                prefix, strna(s));
        }

+#if HAVE_LIBBPF
+        if (exec_context_restrict_filesystems_set(c))
+                SET_FOREACH(e, c->restrict_filesystems)
+                        fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
+#endif
+
        if (c->network_namespace_path)
                fprintf(f,
                        "%sNetworkNamespacePath: %s\n",
--- a/src/core/execute.h
+++ b/src/core/execute.h
@ -314,6 +314,9 @@ struct ExecContext {

        unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */

+        Set *restrict_filesystems;
+        bool restrict_filesystems_allow_list:1;
+
        Hashmap *syscall_filter;
        Set *syscall_archs;
        int syscall_errno;
@ -342,6 +345,13 @@ static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
        return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
 }

+/* True when the allow-list flag is set or the restrict_filesystems set is non-empty. */
+static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) {
+        assert(c);
+
+        return !(!c->restrict_filesystems_allow_list && set_isempty(c->restrict_filesystems));
+}
+
 static inline bool exec_context_with_rootfs(const ExecContext *c) {
        assert(c);

--- a/src/core/main.c
+++ b/src/core/main.c
@ -22,6 +22,9 @@
 #include "alloc-util.h"
 #include "apparmor-setup.h"
 #include "architecture.h"
+#if HAVE_LIBBPF
+#include "bpf-lsm.h"
+#endif
 #include "build.h"
 #include "bus-error.h"
 #include "bus-util.h"
--- a/src/core/manager.c
+++ b/src/core/manager.c
@ -930,6 +930,14 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager
                r = manager_setup_sigchld_event_source(m);
                if (r < 0)
                        return r;
+
+#if HAVE_LIBBPF
+                if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported()) {
+                        r = lsm_bpf_setup(m);
+                        if (r < 0)
+                                return r;
+                }
+#endif
        }

        if (test_run_flags == 0) {
@ -1535,6 +1543,10 @@ Manager* manager_free(Manager *m) {
                m->prefix[dt] = mfree(m->prefix[dt]);
        free(m->received_credentials);

+#if BPF_FRAMEWORK
+        lsm_bpf_destroy(m->restrict_fs);
+#endif
+
        return mfree(m);
 }