1
0
mirror of https://github.com/systemd/systemd.git synced 2025-03-31 14:50:15 +03:00

Merge pull request #18145 from kinvolk/iaguis/lsm-bpf

Add RestrictFileSystems= property using LSM BPF
This commit is contained in:
Lennart Poettering 2021-10-06 16:23:27 +02:00 committed by GitHub
commit 9a1ddc8dee
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
48 changed files with 1808 additions and 52 deletions

9
README
View File

@ -35,7 +35,7 @@ REQUIREMENTS:
Linux kernel >= 4.17 for cgroup-bpf socket address hooks
Linux kernel >= 5.3 for bounded-loops in BPF program
Linux kernel >= 5.4 for signed Verity images support
Linux kernel >= 5.7 for BPF links
Linux kernel >= 5.7 for BPF links and the BPF LSM hook
Kernel Config Options:
CONFIG_DEVTMPFS
@ -119,6 +119,13 @@ REQUIREMENTS:
Required for signed Verity images support:
CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG
Required for RestrictFileSystems= in service units:
CONFIG_BPF
CONFIG_BPF_SYSCALL
CONFIG_BPF_LSM
CONFIG_DEBUG_INFO_BTF
CONFIG_LSM="...,bpf" or kernel booted with lsm="...,bpf".
We recommend to turn off Real-Time group scheduling in the
kernel when using systemd. RT group scheduling effectively
makes RT scheduling unavailable for most userspace, since it

View File

@ -2837,6 +2837,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindReadOnlyPaths = [...];
@ -3362,6 +3364,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<!--property RestrictNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<!--property BindReadOnlyPaths is not documented!-->
@ -3966,6 +3970,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindReadOnlyPaths"/>
@ -4677,6 +4683,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindReadOnlyPaths = [...];
@ -5228,6 +5236,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<!--property RestrictNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<!--property BindReadOnlyPaths is not documented!-->
@ -5828,6 +5838,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindReadOnlyPaths"/>
@ -6436,6 +6448,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindReadOnlyPaths = [...];
@ -6915,6 +6929,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<!--property RestrictNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<!--property BindReadOnlyPaths is not documented!-->
@ -7433,6 +7449,8 @@ node /org/freedesktop/systemd1/unit/home_2emount {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindReadOnlyPaths"/>
@ -8162,6 +8180,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly t RestrictNamespaces = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly (bas) RestrictFileSystems = ...;
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindPaths = [...];
@org.freedesktop.DBus.Property.EmitsChangedSignal("const")
readonly a(ssbt) BindReadOnlyPaths = [...];
@ -8627,6 +8647,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<!--property RestrictNamespaces is not documented!-->
<!--property RestrictFileSystems is not documented!-->
<!--property BindPaths is not documented!-->
<!--property BindReadOnlyPaths is not documented!-->
@ -9131,6 +9153,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap {
<variablelist class="dbus-property" generated="True" extra-ref="RestrictNamespaces"/>
<variablelist class="dbus-property" generated="True" extra-ref="RestrictFileSystems"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindPaths"/>
<variablelist class="dbus-property" generated="True" extra-ref="BindReadOnlyPaths"/>

View File

@ -88,6 +88,12 @@
<arg choice="plain">syscall-filter</arg>
<arg choice="opt"><replaceable>SET</replaceable></arg>
</cmdsynopsis>
<cmdsynopsis>
<command>systemd-analyze</command>
<arg choice="opt" rep="repeat">OPTIONS</arg>
<arg choice="plain">filesystems</arg>
<arg choice="opt"><replaceable>SET</replaceable></arg>
</cmdsynopsis>
<cmdsynopsis>
<command>systemd-analyze</command>
<arg choice="opt" rep="repeat">OPTIONS</arg>
@ -410,6 +416,14 @@ Conditions succeeded.</programlisting>
<replaceable>SET</replaceable> must include the <literal>@</literal> prefix.</para>
</refsect2>
<refsect2>
<title><command>systemd-analyze filesystems <optional><replaceable>SET</replaceable>...</optional></command></title>
<para>This command will list filesystems in the specified filesystem set
<replaceable>SET</replaceable>, or all known sets if no sets are specified. Argument
<replaceable>SET</replaceable> must include the <literal>@</literal> prefix.</para>
</refsect2>
<refsect2>
<title><command>systemd-analyze calendar <replaceable>EXPRESSION</replaceable>...</command></title>

View File

@ -1842,6 +1842,100 @@ BindReadOnlyPaths=/var/lib/systemd</programlisting>
logging.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RestrictFileSystems=</varname></term>
<listitem><para>Restricts the set of filesystems processes of this unit can open files on. Takes a space-separated
list of filesystem names. Any filesystem listed is made accessible to the unit's processes, access to filesystem
types not listed is prohibited (allow-listing). If the first character of the list is <literal>~</literal>, the
effect is inverted: access to the filesystems listed is prohibited (deny-listing). If the empty string is assigned,
access to filesystems is not restricted.</para>
<para>If you specify both types of this option (i.e. allow-listing and deny-listing), the first encountered will take
precedence and will dictate the default action (allow access to the filesystem or deny it). Then the next occurrences
of this option will add or delete the listed filesystems from the set of the restricted filesystems, depending on its
type and the default action.</para>
<para>Example: if a unit has the following,
<programlisting>RestrictFileSystems=ext4 tmpfs
RestrictFileSystems=ext2 ext4</programlisting>
then access to <constant>ext4</constant>, <constant>tmpfs</constant>, and <constant>ext2</constant> is allowed
and access to other filesystems is denied.</para>
<para>Example: if a unit has the following,
<programlisting>RestrictFileSystems=ext4 tmpfs
RestrictFileSystems=~ext4</programlisting>
then only access to <constant>tmpfs</constant> is allowed.</para>
<para>Example: if a unit has the following,
<programlisting>RestrictFileSystems=~ext4 tmpfs
RestrictFileSystems=ext4</programlisting>
then only access to <constant>tmpfs</constant> is denied.</para>
<para>As the number of possible filesystems is large, predefined sets of filesystems are provided. A set
starts with the <literal>@</literal> character, followed by the name of the set.</para>
<table>
<title>Currently predefined filesystem sets</title>
<tgroup cols='2'>
<colspec colname='set' />
<colspec colname='description' />
<thead>
<row>
<entry>Set</entry>
<entry>Description</entry>
</row>
</thead>
<tbody>
<row>
<entry>@basic-api</entry>
<entry>Basic filesystem API.</entry>
</row>
<row>
<entry>@auxiliary-api</entry>
<entry>Auxiliary filesystem API.</entry>
</row>
<row>
<entry>@common-block</entry>
<entry>Common block device filesystems.</entry>
</row>
<row>
<entry>@historical-block</entry>
<entry>Historical block device filesystems.</entry>
</row>
<row>
<entry>@network</entry>
<entry>Well-known network filesystems.</entry>
</row>
<row>
<entry>@privileged-api</entry>
<entry>Privileged filesystem API.</entry>
</row>
<row>
<entry>@temporary</entry>
<entry>Temporary filesystems: tmpfs, ramfs.</entry>
</row>
<row>
<entry>@known</entry>
<entry>All known filesystems defined by the kernel. This list is defined statically in systemd based on a kernel
version that was available when this systemd version was released. It will become progressively more
out-of-date as the kernel is updated.</entry>
</row>
</tbody>
</tgroup>
</table>
<para>Use
<citerefentry><refentrytitle>systemd-analyze</refentrytitle><manvolnum>1</manvolnum></citerefentry>'s
<command>filesystems</command> command to retrieve a list of filesystems defined on the local
system.</para>
<para>Note that this setting might not be supported on some systems (for example if the LSM eBPF hook is
not enabled in the underlying kernel or if not using the unified control group hierarchy). In that case this setting
has no effect.</para></listitem>
</varlistentry>
<varlistentry>
<term><varname>RestrictNamespaces=</varname></term>
@ -3761,6 +3855,11 @@ StandardInputData=SWNrIHNpdHplIGRhIHVuJyBlc3NlIEtsb3BzLAp1ZmYgZWVtYWwga2xvcHAncy
<entry><constant>EXIT_CREDENTIALS</constant></entry>
<entry>Failed to set up unit's credentials. See <varname>LoadCredential=</varname> and <varname>SetCredential=</varname> above.</entry>
</row>
<row>
<entry>245</entry>
<entry><constant>EXIT_BPF</constant></entry>
<entry>Failed to apply BPF restrictions. See <varname>RestrictFileSystems=</varname> above.</entry>
</row>
</tbody>
</tgroup>
</table>

View File

@ -12,6 +12,7 @@ Distribution=arch
BuildPackages=
acl
bzip2
clang
cryptsetup
curl
dbus
@ -26,6 +27,7 @@ BuildPackages=
inetutils
iptables
kmod
libbpf
libcap
libgcrypt
libidn2
@ -34,6 +36,7 @@ BuildPackages=
libutil-linux
libxkbcommon
libxslt
llvm
lz4
meson
pam
@ -48,6 +51,7 @@ BuildPackages=
Packages=
gdb
libbpf
libidn2
nano
qrencode

View File

@ -10,9 +10,11 @@ Release=unstable
[Packages]
BuildPackages=
acl
clang
docbook-xml
docbook-xsl
gcc
g++
gettext
git
gnu-efi
@ -20,6 +22,7 @@ BuildPackages=
libacl1-dev
libaudit-dev
libblkid-dev
libbpf-dev
libbz2-dev
libcap-dev
libcryptsetup-dev
@ -46,6 +49,7 @@ BuildPackages=
libtss2-dev
libxkbcommon-dev
libzstd-dev
llvm
meson
pkg-config
python3
@ -59,6 +63,7 @@ BuildPackages=
Packages=
gdb
libbpf0
libfdisk1
libfido2-1
libidn2-0

View File

@ -28,6 +28,7 @@
#include "exit-status.h"
#include "fd-util.h"
#include "fileio.h"
#include "filesystems.h"
#include "format-table.h"
#include "glob-util.h"
#include "hashmap.h"
@ -46,6 +47,7 @@
#endif
#include "sort-util.h"
#include "special.h"
#include "stat-util.h"
#include "string-table.h"
#include "strv.h"
#include "strxcpyx.h"
@ -1622,6 +1624,9 @@ static int load_kernel_syscalls(Set **ret) {
static void syscall_set_remove(Set *s, const SyscallFilterSet *set) {
const char *syscall;
if (!set)
return;
NULSTR_FOREACH(syscall, set->value) {
if (syscall[0] == '@')
continue;
@ -1743,6 +1748,172 @@ static int dump_syscall_filters(int argc, char *argv[], void *userdata) {
}
#endif
/* Reads /proc/filesystems and collects the filesystem names listed there into a newly allocated Set of
 * strings. Only lines containing a tab are considered; the name is whatever follows the tab (and any
 * further whitespace).
 *
 * On success the set is stored in *ret (the caller takes over the reference) and 0 is returned. On
 * failure a negative errno-style value is returned and *ret is left untouched. */
static int load_available_kernel_filesystems(Set **ret) {
        _cleanup_set_free_ Set *filesystems = NULL;
        /* Whole contents of /proc/filesystems. Must be released on every return path, hence the cleanup
         * attribute (it used to be a plain pointer that was never freed). */
        _cleanup_free_ char *t = NULL;
        int r;

        assert(ret);

        /* Let's read the available filesystems */

        r = read_virtual_file("/proc/filesystems", SIZE_MAX, &t, NULL);
        if (r < 0)
                return r;

        for (int i = 0;;) {
                _cleanup_free_ char *line = NULL;
                const char *l, *p;

                r = string_extract_line(t, i++, &line);
                if (r < 0)
                        return log_oom();
                if (r == 0)
                        break;

                /* If no separate copy of the line was handed back, fall back to the whole buffer. Use a
                 * non-owning pointer for that: assigning 't' to the self-freeing 'line' would release the
                 * same allocation twice. */
                l = line ? line : t;

                p = strchr(l, '\t');
                if (!p)
                        continue;

                p += strspn(p, WHITESPACE);

                r = set_put_strdup(&filesystems, p);
                if (r < 0)
                        return log_error_errno(r, "Failed to add filesystem to list: %m");
        }

        *ret = TAKE_PTR(filesystems);
        return 0;
}
/* Removes every filesystem named in 'set' from the string set 's' and frees the removed entries.
 * Entries starting with '@' (references to other sets) are skipped. A NULL 'set' is a no-op, which
 * matches the behaviour of syscall_set_remove() and dump_filesystem(). */
static void filesystem_set_remove(Set *s, const FilesystemSet *set) {
        const char *filesystem;

        if (!set)
                return;

        NULSTR_FOREACH(filesystem, set->value) {
                if (filesystem[0] == '@')
                        continue;

                free(set_remove(s, filesystem));
        }
}
static void dump_filesystem(const FilesystemSet *set) {
const char *filesystem;
if (!set)
return;
printf("%s%s%s\n"
" # %s\n",
ansi_highlight(),
set->name,
ansi_normal(),
set->help);
NULSTR_FOREACH(filesystem, set->value)
printf(" %s%s%s\n", filesystem[0] == '@' ? ansi_underline() : "", filesystem, ansi_normal());
}
/* Verb handler for "systemd-analyze filesystems [SET...]".
 *
 * Without arguments: prints every predefined filesystem set, then the names from @known that are in no
 * other set, then the names read from the running kernel that are in no set at all.
 * With arguments: prints each named set in order and fails with ENOENT on the first unknown name.
 *
 * Returns 0 on success, a negative errno-style value otherwise. */
static int dump_filesystems(int argc, char *argv[], void *userdata) {
/* Tracks whether a blank separator line has to be printed before the next set. */
bool first = true;
/* Without libbpf support the verb is refused up front; everything below is then dead code. */
#if ! HAVE_LIBBPF
return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Not compiled with libbpf support, sorry.");
#endif
(void) pager_open(arg_pager_flags);
if (strv_isempty(strv_skip(argv, 1))) {
/* No set names given: dump everything. */
_cleanup_set_free_ Set *kernel = NULL, *known = NULL;
const char *fs;
int k;
/* Start with a copy of all names in @known; names covered by other sets are removed below. */
NULSTR_FOREACH(fs, filesystem_sets[FILESYSTEM_SET_KNOWN].value)
if (set_put_strdup(&known, fs) < 0)
return log_oom();
/* Failure is not fatal here; the result is remembered in 'k' and reported at the very end. */
k = load_available_kernel_filesystems(&kernel);
for (FilesystemGroups i = 0; i < _FILESYSTEM_SET_MAX; i++) {
const FilesystemSet *set = filesystem_sets + i;
if (!first)
puts("");
dump_filesystem(set);
/* Whatever any set lists (including @known) is not "unlisted" from the kernel's point of view. */
filesystem_set_remove(kernel, set);
/* @known itself must not be subtracted from 'known', otherwise nothing would remain. */
if (i != FILESYSTEM_SET_KNOWN)
filesystem_set_remove(known, set);
first = false;
}
if (!set_isempty(known)) {
/* Only the array is freed here; the strings stay owned by the 'known' set. */
_cleanup_free_ char **l = NULL;
char **filesystem;
printf("\n"
"# %sUngrouped filesystems%s (known but not included in any of the groups except @known):\n",
ansi_highlight(), ansi_normal());
l = set_get_strv(known);
if (!l)
return log_oom();
strv_sort(l);
STRV_FOREACH(filesystem, l)
printf("# %s\n", *filesystem);
}
if (k < 0) {
/* Flush stdout first so the notice appears after the regular output. */
fputc('\n', stdout);
fflush(stdout);
log_notice_errno(k, "# Not showing unlisted filesystems, couldn't retrieve kernel filesystem list: %m");
} else if (!set_isempty(kernel)) {
/* Only the array is freed here; the strings stay owned by the 'kernel' set. */
_cleanup_free_ char **l = NULL;
char **filesystem;
printf("\n"
"# %sUnlisted filesystems%s (available to the local kernel, but not included in any of the groups listed above):\n",
ansi_highlight(), ansi_normal());
l = set_get_strv(kernel);
if (!l)
return log_oom();
strv_sort(l);
STRV_FOREACH(filesystem, l)
printf("# %s\n", *filesystem);
}
} else {
/* Set names given: dump exactly those, in the order specified. */
char **name;
STRV_FOREACH(name, strv_skip(argv, 1)) {
const FilesystemSet *set;
if (!first)
puts("");
set = filesystem_set_find(*name);
if (!set) {
/* make sure the error appears below normal output */
fflush(stdout);
return log_error_errno(SYNTHETIC_ERRNO(ENOENT),
"Filesystem set \"%s\" not found.", *name);
}
dump_filesystem(set);
first = false;
}
}
return 0;
}
static void parsing_hint(const char *p, bool calendar, bool timestamp, bool timespan) {
if (calendar && calendar_spec_from_string(p, NULL) >= 0)
log_notice("Hint: this expression is a valid calendar specification. "
@ -2169,6 +2340,7 @@ static int help(int argc, char *argv[], void *userdata) {
" capability [CAP...] List capability definitions\n"
" syscall-filter [NAME...] Print list of syscalls in seccomp\n"
" filter\n"
" filesystems [NAME...] Print list of filesystems\n"
" condition CONDITION... Evaluate conditions and asserts\n"
" verify FILE... Check unit files for correctness\n"
" calendar SPEC... Validate repetitive calendar time\n"
@ -2503,6 +2675,7 @@ static int run(int argc, char *argv[]) {
{ "exit-status", VERB_ANY, VERB_ANY, 0, dump_exit_status },
{ "syscall-filter", VERB_ANY, VERB_ANY, 0, dump_syscall_filters },
{ "capability", VERB_ANY, VERB_ANY, 0, dump_capabilities },
{ "filesystems", VERB_ANY, VERB_ANY, 0, dump_filesystems },
{ "condition", VERB_ANY, VERB_ANY, 0, do_condition },
{ "verify", 2, VERB_ANY, 0, do_verify },
{ "calendar", 2, VERB_ANY, 0, test_calendar },

View File

@ -1367,6 +1367,29 @@ int cg_pid_get_machine_name(pid_t pid, char **machine) {
return cg_path_get_machine_name(cgroup, machine);
}
/* Queries the file handle of 'path' via name_to_handle_at() and stores the 64-bit value it carries in
 * *ret. The handle buffer is sized for exactly one uint64_t, which is what cgroupfs hands out.
 *
 * Returns 0 on success, or -errno if name_to_handle_at() fails (in which case *ret is not written). */
int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
        /* struct file_handle ends in a flexible payload; reserve room for an 8 byte handle behind it. */
        union {
                struct file_handle f_handle;
                uint8_t space[offsetof(struct file_handle, f_handle) + sizeof(uint64_t)];
        } handle = {
                .f_handle.handle_bytes = sizeof(uint64_t),
        };
        int mount_id = -1, q;

        assert(path);
        assert(ret);

        /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
         * name_to_handle_at_loop() does in mountpoint-util.c */
        q = name_to_handle_at(AT_FDCWD, path, &handle.f_handle, &mount_id, 0);
        if (q < 0)
                return -errno;

        *ret = *(uint64_t *) handle.f_handle.f_handle;
        return 0;
}
int cg_path_get_session(const char *path, char **session) {
_cleanup_free_ char *unit = NULL;
char *start, *end;

View File

@ -33,6 +33,9 @@ typedef enum CGroupController {
CGROUP_CONTROLLER_BPF_FOREIGN,
CGROUP_CONTROLLER_BPF_SOCKET_BIND,
CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES,
/* The BPF hook implementing RestrictFileSystems= is not defined here.
* It's applied as late as possible in exec_child() so we don't block
* our own unit setup code. */
_CGROUP_CONTROLLER_MAX,
_CGROUP_CONTROLLER_INVALID = -EINVAL,
@ -236,6 +239,7 @@ int cg_is_empty_recursive(const char *controller, const char *path);
int cg_get_root_path(char **path);
int cg_path_get_cgroupid(const char *path, uint64_t *ret);
int cg_path_get_session(const char *path, char **session);
int cg_path_get_owner_uid(const char *path, uid_t *uid);
int cg_path_get_unit(const char *path, char **unit);

36
src/basic/check-filesystems.sh Executable file
View File

@ -0,0 +1,36 @@
#!/bin/bash
# SPDX-License-Identifier: LGPL-2.1-or-later
#
# Usage: check-filesystems.sh CPP FILESYSTEMS_GPERF [HEADER...]
#
# Preprocesses the given headers with CPP, collects every *MAGIC macro they define and verifies that
# each one is mentioned inside a {...} entry of FILESYSTEMS_GPERF. Prints one line per macro that is
# missing and exits with status 1 if there was any.
set -eu
set -o pipefail

cpp="$1"
filesystems_gperf="$2"
shift 2

# Turn the remaining arguments into "-include HEADER" options for the preprocessor.
includes=""
for header in "$@"; do
    includes="$includes -include $header"
done

missing=0

# $cpp and $includes are intentionally unquoted so that they split into separate words.
# shellcheck disable=SC2086
for fs in $($cpp -dM $includes - </dev/null | \
            grep -E '_MAGIC' | \
            grep -vE 'LINUX_MAGIC' | \
            awk '/^#define[ \t]+[A-Z0-9_]+MAGIC[ \t]+/ { print $2; }'); do
    # Mentioned in the gperf table already? Then there is nothing to report.
    if grep -E "\{.*$fs.*\}" "$filesystems_gperf" >/dev/null; then
        continue
    fi

    case "$fs" in
        # STACK_END_MAGIC doesn't refer to a filesystem
        # mtd_inode was removed in 2015
        # futexfs was removed in 2018
        STACK_END_MAGIC|MTD_INODE_FS_MAGIC|FUTEXFS_SUPER_MAGIC)
            continue
            ;;
    esac

    echo "Filesystem found in kernel header but not in $(basename "$filesystems_gperf"): $fs";
    missing=1
done

if [ "$missing" -ne 0 ]; then
    exit 1
fi

View File

@ -0,0 +1,112 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
%{
#include <linux/magic.h>
#include "filesystems.h"
#include "missing_magic.h"
#include "stat-util.h"
/* One row of the lookup table generated from the keyword list below: a filesystem type name (the
 * gperf key, see "%define slot-name name") together with the statfs magic numbers listed for it.
 * Entries list fewer than FILESYSTEM_MAGIC_MAX values; fs_in_group() stops at the first zero slot. */
struct FilesystemMagic {
const char *name;
statfs_f_type_t magic[FILESYSTEM_MAGIC_MAX];
};
%}
struct FilesystemMagic;
%language=ANSI-C
%define hash-function-name filesystems_gperf_hash
%define lookup-function-name filesystems_gperf_lookup
%define slot-name name
%readonly-tables
%omit-struct-type
%struct-type
%includes
%%
apparmorfs, {AAFS_MAGIC}
adfs, {ADFS_SUPER_MAGIC}
affs, {AFFS_SUPER_MAGIC}
afs, {AFS_FS_MAGIC, AFS_SUPER_MAGIC}
anon_inodefs, {ANON_INODE_FS_MAGIC}
autofs, {AUTOFS_SUPER_MAGIC}
balloon-kvm, {BALLOON_KVM_MAGIC}
bdev, {BDEVFS_MAGIC}
binder, {BINDERFS_SUPER_MAGIC}
binfmt_misc, {BINFMTFS_MAGIC}
bpf, {BPF_FS_MAGIC}
btrfs, {BTRFS_SUPER_MAGIC}
btrfs_test_fs, {BTRFS_TEST_MAGIC}
ceph, {CEPH_SUPER_MAGIC}
cgroup2, {CGROUP2_SUPER_MAGIC}
cgroup, {CGROUP_SUPER_MAGIC}
cifs, {CIFS_MAGIC_NUMBER}
coda, {CODA_SUPER_MAGIC}
configfs, {CONFIGFS_MAGIC}
cramfs, {CRAMFS_MAGIC}
dax, {DAXFS_MAGIC}
debugfs, {DEBUGFS_MAGIC}
devmem, {DEVMEM_MAGIC}
devpts, {DEVPTS_SUPER_MAGIC}
dmabuf, {DMA_BUF_MAGIC}
ecryptfs, {ECRYPTFS_SUPER_MAGIC}
efivarfs, {EFIVARFS_MAGIC}
efs, {EFS_SUPER_MAGIC}
erofs, {EROFS_SUPER_MAGIC_V1}
ext2, {EXT2_SUPER_MAGIC}
ext3, {EXT3_SUPER_MAGIC}
ext4, {EXT4_SUPER_MAGIC}
exfat, {EXFAT_SUPER_MAGIC}
f2fs, {F2FS_SUPER_MAGIC}
fuseblk, {FUSE_SUPER_MAGIC}
fuse, {FUSE_SUPER_MAGIC}
fusectl, {FUSE_CTL_SUPER_MAGIC}
gfs, {GFS2_MAGIC}
gfs2, {GFS2_MAGIC}
hostfs, {HOSTFS_SUPER_MAGIC}
hpfs, {HPFS_SUPER_MAGIC}
hugetlbfs, {HUGETLBFS_MAGIC}
iso9660, {ISOFS_SUPER_MAGIC}
jffs2, {JFFS2_SUPER_MAGIC}
minix, {MINIX_SUPER_MAGIC, MINIX_SUPER_MAGIC2, MINIX2_SUPER_MAGIC, MINIX2_SUPER_MAGIC2, MINIX3_SUPER_MAGIC}
mqueue, {MQUEUE_MAGIC}
msdos, {MSDOS_SUPER_MAGIC}
ncp, {NCP_SUPER_MAGIC}
ncpfs, {NCP_SUPER_MAGIC}
nfs, {NFS_SUPER_MAGIC}
nfs4, {NFS_SUPER_MAGIC}
nilfs2, {NILFS_SUPER_MAGIC}
nsfs, {NSFS_MAGIC}
ocfs2, {OCFS2_SUPER_MAGIC}
openpromfs, {OPENPROM_SUPER_MAGIC}
orangefs, {ORANGEFS_DEVREQ_MAGIC}
overlay, {OVERLAYFS_SUPER_MAGIC}
pipefs, {PIPEFS_MAGIC}
ppc-cmm, {PPC_CMM_MAGIC}
proc, {PROC_SUPER_MAGIC}
pstore, {PSTOREFS_MAGIC}
pvfs2, {ORANGEFS_DEVREQ_MAGIC}
qnx4, {QNX4_SUPER_MAGIC}
qnx6, {QNX6_SUPER_MAGIC}
ramfs, {RAMFS_MAGIC}
resctrl, {RDTGROUP_SUPER_MAGIC}
reiserfs, {REISERFS_SUPER_MAGIC}
secretmem, {SECRETMEM_MAGIC}
securityfs, {SECURITYFS_MAGIC}
selinuxfs, {SELINUX_MAGIC}
shiftfs, {SHIFTFS_MAGIC}
smackfs, {SMACK_MAGIC}
smb3, {SMB_SUPER_MAGIC}
smbfs, {SMB_SUPER_MAGIC}
sockfs, {SOCKFS_MAGIC}
squashfs, {SQUASHFS_MAGIC}
sysfs, {SYSFS_MAGIC}
tmpfs, {TMPFS_MAGIC}
tracefs, {TRACEFS_MAGIC}
udf, {UDF_SUPER_MAGIC}
usbdevfs, {USBDEVICE_SUPER_MAGIC}
vboxsf, {VBOXSF_SUPER_MAGIC}
vfat, {MSDOS_SUPER_MAGIC}
v9fs, {V9FS_MAGIC}
xenfs, {XENFS_SUPER_MAGIC}
xfs, {XFS_SUPER_MAGIC}
z3fold, {Z3FOLD_MAGIC}
zonefs, {ZONEFS_MAGIC}
zsmalloc, {ZSMALLOC_MAGIC}

131
src/basic/filesystems.c Normal file
View File

@ -0,0 +1,131 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "filesystems-gperf.h"
/* Looks up the filesystem type 'name' in the gperf-generated table. On a hit, *ret is pointed at the
 * table's array of magic numbers for that type and 0 is returned. Unknown names yield -EINVAL and
 * leave *ret unchanged. */
int fs_type_from_string(const char *name, const statfs_f_type_t **ret) {
        const struct FilesystemMagic *entry;

        assert(name);
        assert(ret);

        entry = filesystems_gperf_lookup(name, strlen(name));
        if (entry) {
                *ret = entry->magic;
                return 0;
        }

        return -EINVAL;
}
/* Checks whether the filesystem described by 's' matches any member of the set 'fs_group'.
 * Each name in the set is resolved to its magic numbers and compared against 's'. Names that cannot
 * be resolved are ignored. Returns true on the first match and false if nothing matches. */
int fs_in_group(const struct statfs *s, FilesystemGroups fs_group) {
        const char *member;

        NULSTR_FOREACH(member, filesystem_sets[fs_group].value) {
                const statfs_f_type_t *magics;

                if (fs_type_from_string(member, &magics) != 0)
                        continue;

                /* A zero slot marks the end of the magic numbers in use for this entry. */
                for (size_t k = 0; k < FILESYSTEM_MAGIC_MAX && magics[k] != 0; k++)
                        if (is_fs_type(s, magics[k]))
                                return true;
        }

        return false;
}
/* Table of the predefined filesystem sets, indexed by FilesystemGroups.
 * .name is the set name including the '@' prefix, .help a one-line description, and .value the list
 * of member filesystem names, each written with an explicit trailing \0 so the list can be walked
 * with NULSTR_FOREACH(). */
const FilesystemSet filesystem_sets[_FILESYSTEM_SET_MAX] = {
[FILESYSTEM_SET_BASIC_API] = {
.name = "@basic-api",
.help = "Basic filesystem API",
.value =
"cgroup\0"
"cgroup2\0"
"devpts\0"
"mqueue\0"
"proc\0"
"sysfs\0"
},
[FILESYSTEM_SET_AUXILIARY_API] = {
.name = "@auxiliary-api",
.help = "Auxiliary filesystem API",
.value =
"configfs\0"
"efivarfs\0"
"fusectl\0"
"hugetlbfs\0"
"securityfs\0"
},
[FILESYSTEM_SET_COMMON_BLOCK] = {
.name = "@common-block",
.help = "Common block device filesystems",
.value =
"btrfs\0"
"ext4\0"
"vfat\0"
"xfs\0"
},
[FILESYSTEM_SET_HISTORICAL_BLOCK] = {
.name = "@historical-block",
.help = "Historical block device filesystems",
.value =
"ext2\0"
"ext3\0"
"minix\0"
},
[FILESYSTEM_SET_NETWORK] = {
.name = "@network",
.help = "Well-known network filesystems",
.value =
"afs\0"
"cifs\0"
"gfs\0"
"gfs2\0"
"ncpfs\0"
"ncp\0"
"nfs\0"
"nfs4\0"
"ocfs2\0"
"pvfs2\0"
"smb3\0"
"smbfs\0"
},
[FILESYSTEM_SET_PRIVILEGED_API] = {
.name = "@privileged-api",
.help = "Privileged filesystem API",
.value =
"bpf\0"
"debugfs\0"
"pstore\0"
"tracefs\0"
},
[FILESYSTEM_SET_TEMPORARY] = {
.name = "@temporary",
.help = "Temporary filesystems",
.value =
"ramfs\0"
"tmpfs\0"
},
[FILESYSTEM_SET_KNOWN] = {
.name = "@known",
.help = "All known filesystems declared in the kernel",
.value =
/* Generated at build time from the keyword list in filesystems-gperf.gperf. */
#include "filesystem-list.h"
},
};
const FilesystemSet *filesystem_set_find(const char *name) {
if (isempty(name) || name[0] != '@')
return NULL;
for (FilesystemGroups i = 0; i < _FILESYSTEM_SET_MAX; i++)
if (streq(filesystem_sets[i].name, name))
return filesystem_sets + i;
return NULL;
}

38
src/basic/filesystems.h Normal file
View File

@ -0,0 +1,38 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include "nulstr-util.h"
#include "stat-util.h"
#include "string-util.h"
#define FILESYSTEM_MAGIC_MAX 10
/* Index of each predefined filesystem set in filesystem_sets[]. _FILESYSTEM_SET_MAX is the number of
 * sets and is used as the loop bound when iterating over the table. */
typedef enum FilesystemGroups {
/* Please leave BASIC_API first and KNOWN last, but sort the rest alphabetically */
FILESYSTEM_SET_BASIC_API,
FILESYSTEM_SET_AUXILIARY_API,
FILESYSTEM_SET_COMMON_BLOCK,
FILESYSTEM_SET_HISTORICAL_BLOCK,
FILESYSTEM_SET_NETWORK,
FILESYSTEM_SET_PRIVILEGED_API,
FILESYSTEM_SET_TEMPORARY,
FILESYSTEM_SET_KNOWN,
_FILESYSTEM_SET_MAX,
_FILESYSTEM_SET_INVALID = -EINVAL,
} FilesystemGroups;
typedef struct FilesystemSet {
const char *name;
const char *help;
const char *value;
} FilesystemSet;
extern const FilesystemSet filesystem_sets[];
const FilesystemSet *filesystem_set_find(const char *name);
int fs_type_from_string(const char *name, const statfs_f_type_t **ret);
int fs_in_group(const struct statfs *s, enum FilesystemGroups fs_group);
/* gperf prototypes */
const struct FilesystemMagic* filesystems_gperf_lookup(const char *key, GPERF_LEN_TYPE length);

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: LGPL-2.1-or-later
import sys


def filesystem_list_entries(lines):
    """Yield one C string literal ('"<name>\\0"') per keyword of a gperf input file.

    `lines` is any iterable of text lines. Everything before the first line starting
    with '%%' (the gperf declarations) is ignored. After it, the text before the first
    comma of each line is taken as the keyword.

    Blank lines and '#' comment lines are skipped: an empty entry would terminate the
    generated NUL-separated list early. A second '%%' line ends the keyword section, so
    trailing C code in the gperf file is never mistaken for keywords.
    """
    in_keywords = False
    for line in lines:
        if line.startswith('%%'):
            if in_keywords:
                # Second separator: the keyword section is over.
                break
            in_keywords = True
            continue
        if not in_keywords:
            continue

        name = line.split(',')[0].strip()
        if not name or name.startswith('#'):
            continue

        yield '"{}\\0"'.format(name)


if __name__ == '__main__':
    # Close the input file deterministically instead of relying on garbage collection.
    with open(sys.argv[1]) as gperf_file:
        for entry in filesystem_list_entries(gperf_file):
            print(entry)

View File

@ -54,6 +54,8 @@ basic_sources = files('''
fd-util.h
fileio.c
fileio.h
filesystems.c
filesystems.h
format-util.c
format-util.h
fs-util.c
@ -373,6 +375,34 @@ run_target(
############################################################
# Kernel headers that check-filesystems.sh scans for filesystem magic definitions.
filesystem_includes = ['linux/magic.h',
                       'linux/gfs2_ondisk.h']

# Fail the configuration if the headers define a magic that filesystems-gperf.gperf does not list.
check_filesystems = find_program('check-filesystems.sh')
r = run_command([check_filesystems, cpp, 'filesystems-gperf.gperf'] + filesystem_includes)
if r.returncode() != 0
        # error() aborts the configuration, so nothing after it in this branch would ever run.
        error('found unknown filesystem(s) defined in kernel headers:\n\n' + r.stdout())
endif
filesystems_gperf_h = custom_target(
'filesystems-gperf.h',
input : 'filesystems-gperf.gperf',
output : 'filesystems-gperf.h',
command : [gperf, '@INPUT@', '--output-file', '@OUTPUT@'])
generate_filesystem_list = find_program('generate-filesystem-list.py')
fname = 'filesystem-list.h'
filesystem_list_h = custom_target(
fname,
input : 'filesystems-gperf.gperf',
output : fname,
command : [generate_filesystem_list,
'@INPUT@'],
capture : true)
basic_sources += [filesystem_list_h, filesystems_gperf_h]
libbasic = static_library(
'basic',
basic_sources,

View File

@ -52,11 +52,6 @@
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#endif
/* Not exposed yet. Defined at fs/cifs/cifsglob.h */
#ifndef CIFS_MAGIC_NUMBER
#define CIFS_MAGIC_NUMBER 0xFF534D42
#endif
/* linux/nsfs.h */
#ifndef NS_GET_NSTYPE /* d95fa3c76a66b6d76b1e109ea505c55e66360f3c (4.11) */
#define NS_GET_NSTYPE _IO(0xb7, 0x3)

View File

@ -37,3 +37,138 @@
#ifndef XFS_SB_MAGIC
#define XFS_SB_MAGIC 0x58465342
#endif
/* Not exposed yet. Defined at fs/cifs/cifsglob.h */
#ifndef CIFS_MAGIC_NUMBER
#define CIFS_MAGIC_NUMBER 0xFF534D42
#endif
/* 257f871993474e2bde6c497b54022c362cf398e1 (4.5) */
#ifndef OVERLAYFS_SUPER_MAGIC
#define OVERLAYFS_SUPER_MAGIC 0x794c7630
#endif
/* 2a28900be20640fcd1e548b1e3bad79e8221fcf9 (4.7) */
#ifndef UDF_SUPER_MAGIC
#define UDF_SUPER_MAGIC 0x15013346
#endif
/* b1123ea6d3b3da25af5c8a9d843bd07ab63213f4 (4.8)*/
#ifndef BALLOON_KVM_MAGIC
#define BALLOON_KVM_MAGIC 0x13661366
#endif
/* 48b4800a1c6af2cdda344ea4e2c843dcc1f6afc9 (4.8) */
#ifndef ZSMALLOC_MAGIC
#define ZSMALLOC_MAGIC 0x58295829
#endif
/* 3bc52c45bac26bf7ed1dc8d287ad1aeaed1250b6 (4.9) */
#ifndef DAXFS_MAGIC
#define DAXFS_MAGIC 0x64646178
#endif
/* 5ff193fbde20df5d80fec367cea3e7856c057320 (4.10) */
#ifndef RDTGROUP_SUPER_MAGIC
#define RDTGROUP_SUPER_MAGIC 0x7655821
#endif
/* a481f4d917835cad86701fc0d1e620c74bb5cd5f (4.13) */
#ifndef AAFS_MAGIC
#define AAFS_MAGIC 0x5a3c69f0
#endif
/* f044c8847bb61eff5e1e95b6f6bb950e7f4a73a4 (4.15) */
#ifndef AFS_FS_MAGIC
#define AFS_FS_MAGIC 0x6b414653
#endif
/* dddde68b8f06dd83486124b8d245e7bfb15c185d (4.20) */
#ifndef XFS_SUPER_MAGIC
#define XFS_SUPER_MAGIC 0x58465342
#endif
/* 3ad20fe393b31025bebfc2d76964561f65df48aa (5.0) */
#ifndef BINDERFS_SUPER_MAGIC
#define BINDERFS_SUPER_MAGIC 0x6c6f6f70
#endif
/* ed63bb1d1f8469586006a9ca63c42344401aa2ab (5.3) */
#ifndef DMA_BUF_MAGIC
#define DMA_BUF_MAGIC 0x444d4142
#endif
/* ea8157ab2ae5e914dd427e5cfab533b6da3819cd (5.3) */
#ifndef Z3FOLD_MAGIC
#define Z3FOLD_MAGIC 0x33
#endif
/* 47e4937a4a7ca4184fd282791dfee76c6799966a (5.4) */
#ifndef EROFS_SUPER_MAGIC_V1
#define EROFS_SUPER_MAGIC_V1 0xe0f5e1e2
#endif
/* fe030c9b85e6783bc52fe86449c0a4b8aa16c753 (5.5) */
#ifndef PPC_CMM_MAGIC
#define PPC_CMM_MAGIC 0xc7571590
#endif
/* 8dcc1a9d90c10fa4143e5c17821082e5e60e46a1 (5.6) */
#ifndef ZONEFS_MAGIC
#define ZONEFS_MAGIC 0x5a4f4653
#endif
/* 3234ac664a870e6ea69ae3a57d824cd7edbeacc5 (5.8) */
#ifndef DEVMEM_MAGIC
#define DEVMEM_MAGIC 0x454d444d
#endif
/* Not in mainline but included in Ubuntu */
#ifndef SHIFTFS_MAGIC
#define SHIFTFS_MAGIC 0x6a656a62
#endif
/* 1507f51255c9ff07d75909a84e7c0d7f3c4b2f49 (5.14) */
#ifndef SECRETMEM_MAGIC
#define SECRETMEM_MAGIC 0x5345434d
#endif
/* Not exposed yet. Defined at fs/fuse/inode.c */
#ifndef FUSE_SUPER_MAGIC
#define FUSE_SUPER_MAGIC 0x65735546
#endif
/* Not exposed yet. Defined at fs/fuse/control.c */
#ifndef FUSE_CTL_SUPER_MAGIC
#define FUSE_CTL_SUPER_MAGIC 0x65735543
#endif
/* Not exposed yet. Defined at fs/ceph/super.h */
#ifndef CEPH_SUPER_MAGIC
#define CEPH_SUPER_MAGIC 0x00c36400
#endif
/* Not exposed yet. Defined at fs/orangefs/orangefs-kernel.h */
#ifndef ORANGEFS_DEVREQ_MAGIC
#define ORANGEFS_DEVREQ_MAGIC 0x20030529
#endif
/* linux/gfs2_ondisk.h */
#ifndef GFS2_MAGIC
#define GFS2_MAGIC 0x01161970
#endif
/* Not exposed yet. Defined at fs/configfs/mount.c */
#ifndef CONFIGFS_MAGIC
#define CONFIGFS_MAGIC 0x62656570
#endif
/* Not exposed yet. Defined at fs/vboxsf/super.c */
#ifndef VBOXSF_SUPER_MAGIC
#define VBOXSF_SUPER_MAGIC 0x786f4256
#endif
/* Not exposed yet. Defined at fs/exfat/exfat_fs.h */
#ifndef EXFAT_SUPER_MAGIC
#define EXFAT_SUPER_MAGIC 0x2011BAB0UL
#endif

View File

@ -8,11 +8,13 @@
#include "chase-symlinks.h"
#include "fd-util.h"
#include "fileio.h"
#include "filesystems.h"
#include "fs-util.h"
#include "missing_stat.h"
#include "missing_syscall.h"
#include "mkdir.h"
#include "mountpoint-util.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "path-util.h"
#include "stat-util.h"
@ -366,48 +368,34 @@ bool fstype_is_network(const char *fstype) {
if (x)
fstype = x;
if (nulstr_contains(filesystem_sets[FILESYSTEM_SET_NETWORK].value, fstype))
return true;
/* Filesystems not present in the internal database */
return STR_IN_SET(fstype,
"afs",
"ceph",
"cifs",
"smb3",
"smbfs",
"sshfs",
"ncpfs",
"ncp",
"nfs",
"nfs4",
"gfs",
"gfs2",
"davfs",
"glusterfs",
"pvfs2", /* OrangeFS */
"ocfs2",
"lustre",
"davfs");
"sshfs");
}
bool fstype_is_api_vfs(const char *fstype) {
const FilesystemSet *fs;
FOREACH_POINTER(fs,
filesystem_sets + FILESYSTEM_SET_BASIC_API,
filesystem_sets + FILESYSTEM_SET_AUXILIARY_API,
filesystem_sets + FILESYSTEM_SET_PRIVILEGED_API,
filesystem_sets + FILESYSTEM_SET_TEMPORARY)
if (nulstr_contains(fs->value, fstype))
return true;
/* Filesystems not present in the internal database */
return STR_IN_SET(fstype,
"autofs",
"bpf",
"cgroup",
"cgroup2",
"configfs",
"cpuset",
"debugfs",
"devpts",
"devtmpfs",
"efivarfs",
"fusectl",
"hugetlbfs",
"mqueue",
"proc",
"pstore",
"ramfs",
"securityfs",
"sysfs",
"tmpfs",
"tracefs");
"devtmpfs");
}
bool fstype_is_blockdev_backed(const char *fstype) {

View File

@ -13,10 +13,13 @@
#include "errno-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "filesystems.h"
#include "fs-util.h"
#include "macro.h"
#include "missing_fs.h"
#include "missing_magic.h"
#include "missing_syscall.h"
#include "nulstr-util.h"
#include "parse-util.h"
#include "stat-util.h"
#include "string-util.h"
@ -198,19 +201,11 @@ int path_is_fs_type(const char *path, statfs_f_type_t magic_value) {
}
bool is_temporary_fs(const struct statfs *s) {
return is_fs_type(s, TMPFS_MAGIC) ||
is_fs_type(s, RAMFS_MAGIC);
return fs_in_group(s, FILESYSTEM_SET_TEMPORARY);
}
bool is_network_fs(const struct statfs *s) {
return is_fs_type(s, CIFS_MAGIC_NUMBER) ||
is_fs_type(s, CODA_SUPER_MAGIC) ||
is_fs_type(s, NCP_SUPER_MAGIC) ||
is_fs_type(s, NFS_SUPER_MAGIC) ||
is_fs_type(s, SMB_SUPER_MAGIC) ||
is_fs_type(s, V9FS_MAGIC) ||
is_fs_type(s, AFS_SUPER_MAGIC) ||
is_fs_type(s, OCFS2_SUPER_MAGIC);
return fs_in_group(s, FILESYSTEM_SET_NETWORK);
}
int fd_is_temporary_fs(int fd) {

381
src/core/bpf-lsm.c Normal file
View File

@ -0,0 +1,381 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include <errno.h>
#include <fcntl.h>
#include <linux/types.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#include "alloc-util.h"
#include "bpf-lsm.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "filesystems.h"
#include "log.h"
#include "manager.h"
#include "mkdir.h"
#include "nulstr-util.h"
#include "stat-util.h"
#include "strv.h"
#if BPF_FRAMEWORK
/* libbpf, clang and llc compile time dependencies are satisfied */
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf/restrict_fs/restrict-fs-skel.h"
#define CGROUP_HASH_SIZE_MAX 2048
/* Releases a restrict_fs skeleton and hands back NULL, so that callers can write "p = restrict_fs_bpf_free(p)"
 * and so that the function can back the restrict_fs_bpf_freep cleanup handler generated below. */
static struct restrict_fs_bpf *restrict_fs_bpf_free(struct restrict_fs_bpf *obj) {

        /* No NULL guard needed here: restrict_fs_bpf__destroy() copes with a NULL object itself */
        (void) restrict_fs_bpf__destroy(obj);

        return NULL;
}

DEFINE_TRIVIAL_CLEANUP_FUNC(struct restrict_fs_bpf *, restrict_fs_bpf_free);
/* Probes whether an LSM BPF program can actually be attached, by attaching it once and dropping the
 * resulting link again when leaving the function.
 *
 * Returns 1 if attaching worked and a negative errno-style value if it did not.
 *
 * The return type is deliberately int rather than bool: the caller checks for "r < 0", and a negative
 * error code returned through a bool would be converted to 'true', i.e. a failed attach would be
 * reported as success. */
static int bpf_can_link_lsm_program(struct bpf_program *prog) {
        _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
        long err;

        assert(prog);

        link = sym_bpf_program__attach_lsm(prog);

        /* Don't just compare against NULL, let libbpf decode the result for us, the same way
         * lsm_bpf_setup() does it when attaching the program for real. */
        err = sym_libbpf_get_error(link);
        if (err != 0)
                /* Normalize to a negative value, so that the caller's "r < 0" check always triggers */
                return err < 0 ? (int) err : -(int) err;

        return 1;
}
static int prepare_restrict_fs_bpf(struct restrict_fs_bpf **ret_obj) {
struct restrict_fs_bpf *obj = 0;
_cleanup_close_ int inner_map_fd = -1;
int r;
assert(ret_obj);
obj = restrict_fs_bpf__open();
if (!obj)
return log_error_errno(errno, "Failed to open BPF object: %m");
/* TODO Maybe choose a number based on runtime information? */
r = sym_bpf_map__resize(obj->maps.cgroup_hash, CGROUP_HASH_SIZE_MAX);
if (r != 0)
return log_error_errno(r,
"Failed to resize BPF map '%s': %m",
sym_bpf_map__name(obj->maps.cgroup_hash));
/* Dummy map to satisfy the verifier */
inner_map_fd = sym_bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t), sizeof(uint32_t), 128, 0);
if (inner_map_fd < 0)
return log_error_errno(errno, "Failed to create BPF map: %m");
r = sym_bpf_map__set_inner_map_fd(obj->maps.cgroup_hash, inner_map_fd);
if (r < 0)
return log_error_errno(r, "Failed to set inner map fd: %m");
r = restrict_fs_bpf__load(obj);
if (r)
return log_error_errno(r, "Failed to load BPF object");
*ret_obj = TAKE_PTR(obj);
return 0;
}
/* Checks whether "bpf" is listed among the active LSMs in /sys/kernel/security/lsm.
 *
 * Returns 1 if it is, 0 if it is not (or if the file cannot be read or parsed), and a negative
 * errno-style value on OOM. The result is cached in a static variable, so the file is consulted at most
 * once per process. */
static int mac_bpf_use(void) {
        _cleanup_free_ char *lsm_list = NULL;
        static int cached_use = -1;
        int r;

        if (cached_use >= 0)
                return cached_use;

        /* Pessimistic default, only upgraded to 1 below once "bpf" has been found in the list */
        cached_use = 0;

        r = read_one_line_file("/sys/kernel/security/lsm", &lsm_list);
        if (r < 0) {
                /* The failure reason is carried in the return value; errno is not a reliable source for
                 * it at this point, so compare against r. A missing file is expected and not logged. */
                if (r != -ENOENT)
                        log_debug_errno(r, "Failed to read /sys/kernel/security/lsm, ignoring: %m");

                return 0;
        }

        /* The file contains a comma separated list of LSM names, walk it entry by entry */
        const char *p = lsm_list;

        for (;;) {
                _cleanup_free_ char *word = NULL;

                r = extract_first_word(&p, &word, ",", 0);
                if (r == 0)
                        break;
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0) {
                        log_debug_errno(r, "Failed to parse /sys/kernel/security/lsm, ignoring: %m");
                        return 0;
                }

                if (streq(word, "bpf")) {
                        cached_use = 1;
                        break;
                }
        }

        return cached_use;
}
/* Runs the individual checks needed for LSM BPF: libbpf must be loadable, the unified cgroup hierarchy
 * must be in use, the bpf LSM must be enabled, and the restrict_fs program must load and link.
 * Returns true only if every one of them passed; each failure is logged where it is detected. */
static bool lsm_bpf_probe(void) {
        _cleanup_(restrict_fs_bpf_freep) struct restrict_fs_bpf *obj = NULL;
        int r;

        r = dlopen_bpf();
        if (r < 0) {
                log_info_errno(r, "Failed to open libbpf, LSM BPF is not supported: %m");
                return false;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0) {
                log_warning_errno(r, "Can't determine whether the unified hierarchy is used: %m");
                return false;
        }
        if (r == 0) {
                log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                               "Not running with unified cgroup hierarchy, LSM BPF is not supported");
                return false;
        }

        r = mac_bpf_use();
        if (r < 0) {
                log_warning_errno(r, "Can't determine whether the BPF LSM module is used: %m");
                return false;
        }
        if (r == 0) {
                log_info_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
                               "BPF LSM hook not enabled in the kernel, LSM BPF not supported");
                return false;
        }

        /* Logs on its own if something goes wrong */
        r = prepare_restrict_fs_bpf(&obj);
        if (r < 0)
                return false;

        r = bpf_can_link_lsm_program(obj->progs.restrict_filesystems);
        if (r < 0) {
                log_warning_errno(r, "Failed to link BPF program. Assuming BPF is not available: %m");
                return false;
        }

        return true;
}

/* Tells whether RestrictFileSystems= can be enforced via LSM BPF on this system: 1 if so, 0 if not.
 * The checks are carried out on the first call only, later calls return the cached verdict. */
int lsm_bpf_supported(void) {
        static int supported = -1;

        if (supported < 0)
                supported = lsm_bpf_probe();

        return supported;
}
/* Prepares the restrict_fs BPF object, stores it in the manager and attaches its LSM program.
 * Returns 0 on success, a negative errno-style value otherwise. The prepared object stays referenced
 * by m->restrict_fs even if attaching the program fails afterwards. */
int lsm_bpf_setup(Manager *m) {
        _cleanup_(bpf_link_freep) struct bpf_link *link = NULL;
        struct bpf_program *prog;
        int r;

        assert(m);

        /* Only writes to m->restrict_fs if preparing the object succeeded */
        r = prepare_restrict_fs_bpf(&m->restrict_fs);
        if (r < 0)
                return r;

        prog = m->restrict_fs->progs.restrict_filesystems;

        link = sym_bpf_program__attach_lsm(prog);
        r = sym_libbpf_get_error(link);
        if (r != 0)
                return log_error_errno(r, "Failed to link '%s' LSM BPF program: %m",
                                       sym_bpf_program__name(prog));

        log_info("LSM BPF program attached");

        /* From here on the link belongs to the skeleton */
        m->restrict_fs->links.restrict_filesystems = TAKE_PTR(link);

        return 0;
}
/* Installs the filesystem allow/deny list of a unit into the LSM BPF maps.
 *
 * A fresh inner hash map is created, registered in the manager's outer cgroup_hash map under the
 * unit's cgroup ID, and then filled: key 0 records whether the list is an allow list (1) or a deny
 * list (0), and every magic number of every listed filesystem is added as a further key.
 *
 * u           - unit whose cgroup_id is used as key in the outer map
 * filesystems - set of filesystem names; unknown names are logged and skipped
 * allow_list  - true if the set lists what is permitted, false if it lists what is forbidden
 *
 * Returns 0 on success, a negative errno-style value on failure. */
int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list) {
        /* Only the outer map needs to keep referring to the inner map once this function is done, hence
         * make sure our own descriptor is closed on every return path instead of being leaked.
         * (The kernel is expected to hold its own reference for maps stored in a map-of-maps.) */
        _cleanup_close_ int inner_map_fd = -1;
        uint32_t dummy_value = 1, zero = 0, allow = allow_list;
        const statfs_f_type_t *magic;
        int outer_map_fd, r;
        const char *fs;

        assert(filesystems);
        assert(u);
        assert(u->manager);

        /* The BPF object only exists if lsm_bpf_setup() ran for this manager. Refuse cleanly instead of
         * dereferencing NULL, like lsm_bpf_cleanup() and lsm_bpf_map_restrict_fs_fd() do. */
        if (!u->manager->restrict_fs)
                return log_unit_error_errno(u, SYNTHETIC_ERRNO(ENOMEDIUM),
                                            "LSM BPF program is not set up, cannot restrict filesystems.");

        inner_map_fd = sym_bpf_create_map(
                        BPF_MAP_TYPE_HASH,
                        sizeof(uint32_t),
                        sizeof(uint32_t),
                        128, /* Should be enough for all filesystem types */
                        0);
        if (inner_map_fd < 0)
                return log_unit_error_errno(u, errno, "Failed to create inner LSM map: %m");

        outer_map_fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
        if (outer_map_fd < 0)
                return log_unit_error_errno(u, errno, "Failed to get BPF map fd: %m");

        if (sym_bpf_map_update_elem(outer_map_fd, &u->cgroup_id, &inner_map_fd, BPF_ANY) != 0)
                return log_unit_error_errno(u, errno, "Error populating LSM BPF map: %m");

        /* Use key 0 to store whether this is an allow list or a deny list */
        if (sym_bpf_map_update_elem(inner_map_fd, &zero, &allow, BPF_ANY) != 0)
                return log_unit_error_errno(u, errno, "Error initializing BPF map: %m");

        SET_FOREACH(fs, filesystems) {
                r = fs_type_from_string(fs, &magic);
                if (r < 0) {
                        log_unit_warning(u, "Invalid filesystem name '%s', ignoring.", fs);
                        continue;
                }

                log_unit_debug(u, "Restricting filesystem access to '%s'", fs);

                /* A filesystem may have several magic numbers, the list ends at the first zero entry */
                for (int i = 0; i < FILESYSTEM_MAGIC_MAX; i++) {
                        if (magic[i] == 0)
                                break;

                        if (sym_bpf_map_update_elem(inner_map_fd, &magic[i], &dummy_value, BPF_ANY) != 0) {
                                r = log_unit_error_errno(u, errno, "Failed to update BPF map: %m");

                                /* Don't leave a half-populated list behind for this cgroup */
                                if (sym_bpf_map_delete_elem(outer_map_fd, &u->cgroup_id) != 0)
                                        log_unit_debug_errno(u, errno, "Failed to delete cgroup entry from LSM BPF map: %m");

                                return r;
                        }
                }
        }

        return 0;
}
/* Drops the unit's cgroup entry from the outer LSM BPF map. Does nothing (and returns 0) if LSM BPF is
 * unsupported or the manager has no restrict_fs object. Returns a negative errno-style value if the
 * map fd cannot be obtained or the entry cannot be deleted. */
int lsm_bpf_cleanup(const Unit *u) {
        int map_fd;

        assert(u);
        assert(u->manager);

        /* Evaluated left to right, i.e. the support check still comes first */
        if (!lsm_bpf_supported() || !u->manager->restrict_fs)
                return 0;

        map_fd = sym_bpf_map__fd(u->manager->restrict_fs->maps.cgroup_hash);
        if (map_fd < 0)
                return log_unit_error_errno(u, errno, "Failed to get BPF map fd: %m");

        if (sym_bpf_map_delete_elem(map_fd, &u->cgroup_id) != 0)
                return log_unit_debug_errno(u, errno, "Failed to delete cgroup entry from LSM BPF map: %m");

        return 0;
}
/* Returns the fd of the outer cgroup_hash map of the manager's restrict_fs object, or -ENOMEDIUM if
 * the manager has no such object. */
int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
        struct restrict_fs_bpf *skel;

        assert(unit);
        assert(unit->manager);

        skel = unit->manager->restrict_fs;
        if (!skel)
                return -ENOMEDIUM;

        return sym_bpf_map__fd(skel->maps.cgroup_hash);
}
/* Public wrapper that tears down a restrict_fs BPF object through the generated skeleton destructor */
void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
        (void) restrict_fs_bpf__destroy(prog);
}
#else /* ! BPF_FRAMEWORK */
/* Fallback implementations for builds without the BPF framework */

int lsm_bpf_supported(void) {
        /* Without compile-time support this can never work */
        return 0;
}

int lsm_bpf_setup(Manager *m) {
        return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Failed to set up LSM BPF: %m");
}

int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, const bool allow_list) {
        return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "Failed to restrict filesystems using LSM BPF: %m");
}

int lsm_bpf_cleanup(const Unit *u) {
        /* Nothing was ever registered, hence nothing to remove */
        return 0;
}

int lsm_bpf_map_restrict_fs_fd(Unit *unit) {
        /* Same error the real implementation reports when there is no BPF object */
        return -ENOMEDIUM;
}

void lsm_bpf_destroy(struct restrict_fs_bpf *prog) {
        /* Nothing to release */
}
#endif
/* Applies one RestrictFileSystems= entry to the set of filesystem names.
 *
 * name        - a filesystem name, or a group name prefixed with '@' which is expanded member by member
 * filesystems - set to modify; may point to NULL when adding
 * flags       - FILESYSTEM_PARSE_* flags describing the entry and the list it is applied to
 * unit, filename, line - only used for log messages
 *
 * Returns 0 on success (including for unknown groups, which are ignored) and a negative errno-style
 * value on failure. */
int lsm_bpf_parse_filesystem(
                const char *name,
                Set **filesystems,
                FilesystemParseFlags flags,
                const char *unit,
                const char *filename,
                unsigned line) {

        bool inverted, allow_list;
        int r;

        assert(name);
        assert(filesystems);

        if (name[0] == '@') {
                const FilesystemSet *group;
                const char *member;

                group = filesystem_set_find(name);
                if (!group) {
                        log_syntax(unit, flags & FILESYSTEM_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
                                   "Unknown filesystem group, ignoring: %s", name);
                        return 0;
                }

                NULSTR_FOREACH(member, group->value) {
                        /* Recurse for every member of the group, but with FILESYSTEM_PARSE_LOG masked
                         * out: problems in our own group table are not the user's fault and should not
                         * be reported as if their configuration was wrong. */
                        r = lsm_bpf_parse_filesystem(member, filesystems, flags &~ FILESYSTEM_PARSE_LOG, unit, filename, line);
                        if (r < 0)
                                return r;
                }

                return 0;
        }

        inverted = !!(flags & FILESYSTEM_PARSE_INVERT);
        allow_list = !!(flags & FILESYSTEM_PARSE_ALLOW_LIST);

        if (inverted == allow_list) {
                /* The entry goes against the kind of list being maintained (e.g. it was forbidden
                 * earlier and shall be allowed now): take it out of the set again. */
                free(set_remove(*filesystems, name));
                return 0;
        }

        r = set_put_strdup(filesystems, name);
        if (r == -ENOMEM)
                return flags & FILESYSTEM_PARSE_LOG ? log_oom() : -ENOMEM;
        if (r < 0 && r != -EEXIST) /* -EEXIST: already in the set, not an error */
                return r;

        return 0;
}

28
src/core/bpf-lsm.h Normal file
View File

@ -0,0 +1,28 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#pragma once
#include "hashmap.h"
typedef enum FilesystemParseFlags {
FILESYSTEM_PARSE_INVERT = 1 << 0,
FILESYSTEM_PARSE_ALLOW_LIST = 1 << 1,
FILESYSTEM_PARSE_LOG = 1 << 2,
} FilesystemParseFlags;
typedef struct Unit Unit;
typedef struct Manager Manager;
typedef struct restrict_fs_bpf restrict_fs_bpf;
int lsm_bpf_supported(void);
int lsm_bpf_setup(Manager *m);
int lsm_bpf_unit_restrict_filesystems(Unit *u, const Set *filesystems, bool allow_list);
int lsm_bpf_cleanup(const Unit *u);
int lsm_bpf_map_restrict_fs_fd(Unit *u);
void lsm_bpf_destroy(struct restrict_fs_bpf *prog);
int lsm_bpf_parse_filesystem(const char *name,
Set **filesystems,
FilesystemParseFlags flags,
const char *unit,
const char *filename,
unsigned line);

View File

@ -0,0 +1,14 @@
# SPDX-License-Identifier: LGPL-2.1+
# Only when the BPF framework is enabled: generate the skeleton header restrict-fs-skel.h from
# restrict-fs.bpf.c by running the build_bpf_skel_py script with the detected clang, llvm-strip
# and bpftool executables for the host CPU family.
if conf.get('BPF_FRAMEWORK') == 1
restrict_fs_skel_h = custom_target(
'restrict-fs-skel.h',
input : 'restrict-fs.bpf.c',
output : 'restrict-fs-skel.h',
command : [build_bpf_skel_py,
'--clang_exec', clang.path(),
'--llvm_strip_exec', llvm_strip.path(),
'--bpftool_exec', bpftool.path(),
'--arch', host_machine.cpu_family(),
'@INPUT@', '@OUTPUT@'])
endif

View File

@ -0,0 +1,78 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
/* The SPDX header above is actually correct in claiming this was
* LGPL-2.1-or-later, because it is. Since the kernel doesn't consider that
* compatible with GPL we will claim this to be GPL however, which should be
* fine given that LGPL-2.1-or-later downgrades to GPL if needed.
*/
#include <linux/types.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
struct super_block {
long unsigned int s_magic;
} __attribute__((preserve_access_index));
struct inode {
struct super_block *i_sb;
} __attribute__((preserve_access_index));
struct file {
struct inode *f_inode;
} __attribute__((preserve_access_index));
/*
* max_entries is set from user space with the bpf_map__resize helper.
* */
struct {
__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
__type(key, uint64_t); /* cgroup ID */
__type(value, uint32_t); /* fs magic set */
} cgroup_hash SEC(".maps");
/* LSM hook for file_open: decides whether the current task may open 'file', based on the magic number
 * of the filesystem the file lives on and on the per-cgroup list found in cgroup_hash.
 *
 * Returns 0 to allow the open, -EPERM to deny it, or the verdict of a previous program if that was
 * already non-zero. */
SEC("lsm/file_open")
int BPF_PROG(restrict_filesystems, struct file *file, int ret)
{
        unsigned long raw_magic_number;
        uint64_t cgroup_id;
        uint32_t *magic_map, zero = 0, *is_allow, magic_number;

        /* ret is the return value from the previous BPF program or 0 if it's
         * the first hook */
        if (ret != 0)
                return ret;

        BPF_CORE_READ_INTO(&raw_magic_number, file, f_inode, i_sb, s_magic);

        /* The inner maps use 32 bit keys. Looking up with a pointer to the unsigned long itself would
         * only read its first four bytes, which is the wrong half on 64 bit big-endian machines.
         * Convert explicitly and use the 32 bit copy as the key. */
        magic_number = (uint32_t) raw_magic_number;

        cgroup_id = bpf_get_current_cgroup_id();

        /* No entry for this cgroup means the unit has no filesystem restrictions */
        magic_map = bpf_map_lookup_elem(&cgroup_hash, &cgroup_id);
        if (!magic_map)
                return 0;

        is_allow = bpf_map_lookup_elem(magic_map, &zero);
        if (!is_allow)
                /* Malformed map, it doesn't include whether it's an allow list
                 * or a deny list. Allow. */
                return 0;

        if (*is_allow) {
                /* Allow-list: Allow access only if magic_number present in inner map */
                if (!bpf_map_lookup_elem(magic_map, &magic_number))
                        return -EPERM;
        } else {
                /* Deny-list: Allow access only if magic_number is not present in inner map */
                if (bpf_map_lookup_elem(magic_map, &magic_number))
                        return -EPERM;
        }

        return 0;
}
static const char _license[] SEC("license") = "GPL";

View File

@ -37,6 +37,12 @@
#include "string-util.h"
#include "virt.h"
#if BPF_FRAMEWORK
#include "bpf-dlopen.h"
#include "bpf-link.h"
#include "bpf/restrict_fs/restrict-fs-skel.h"
#endif
#define CGROUP_CPU_QUOTA_DEFAULT_PERIOD_USEC ((usec_t) 100 * USEC_PER_MSEC)
/* Returns the log level to use when cgroup attribute writes fail. When an attribute is missing or we have access
@ -2102,6 +2108,8 @@ static int unit_update_cgroup(
bool created, is_root_slice;
CGroupMask migrate_mask = 0;
_cleanup_free_ char *cgroup_full_path = NULL;
uint64_t cgroup_id = 0;
int r;
assert(u);
@ -2120,6 +2128,18 @@ static int unit_update_cgroup(
return log_unit_error_errno(u, r, "Failed to create cgroup %s: %m", empty_to_root(u->cgroup_path));
created = r;
if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &cgroup_full_path);
if (r == 0) {
r = cg_path_get_cgroupid(cgroup_full_path, &cgroup_id);
if (r < 0)
log_unit_warning_errno(u, r, "Failed to get cgroup ID on cgroup %s, ignoring: %m", cgroup_full_path);
} else
log_unit_warning_errno(u, r, "Failed to get full cgroup path on cgroup %s, ignoring: %m", empty_to_root(u->cgroup_path));
u->cgroup_id = cgroup_id;
}
/* Start watching it */
(void) unit_watch_cgroup(u);
(void) unit_watch_cgroup_memory(u);
@ -2722,6 +2742,10 @@ void unit_prune_cgroup(Unit *u) {
(void) unit_get_cpu_usage(u, NULL); /* Cache the last CPU usage value before we destroy the cgroup */
#if BPF_FRAMEWORK
(void) lsm_bpf_cleanup(u); /* Remove cgroup from the global LSM BPF map */
#endif
is_root_slice = unit_has_name(u, SPECIAL_ROOT_SLICE);
r = cg_trim_everywhere(u->manager->cgroup_supported, u->cgroup_path, !is_root_slice);

View File

@ -3,6 +3,7 @@
#include <stdbool.h>
#include "bpf-lsm.h"
#include "cgroup-util.h"
#include "cpu-set-util.h"
#include "list.h"

View File

@ -37,6 +37,7 @@
#endif
#include "securebits-util.h"
#include "specifier.h"
#include "stat-util.h"
#include "strv.h"
#include "syslog-util.h"
#include "unit-printf.h"
@ -681,6 +682,46 @@ static int property_get_input_data(
return sd_bus_message_append_array(reply, 'y', c->stdin_data, c->stdin_data_size);
}
/* D-Bus getter for RestrictFileSystems: emits a struct "(bas)" made of the allow-list flag followed
 * by the sorted filesystem names. Returns a negative errno-style value on failure. */
static int property_get_restrict_filesystems(
                sd_bus *bus,
                const char *path,
                const char *interface,
                const char *property,
                sd_bus_message *reply,
                void *userdata,
                sd_bus_error *error) {

        /* Only the array is freed on exit, not the strings in it */
        _cleanup_free_ char **names = NULL;
        ExecContext *c = userdata;
        int r;

        assert(bus);
        assert(reply);
        assert(c);

        r = sd_bus_message_open_container(reply, 'r', "bas");
        if (r < 0)
                return r;

        /* First member: is this an allow list? */
        r = sd_bus_message_append(reply, "b", c->restrict_filesystems_allow_list);
        if (r < 0)
                return r;

#if HAVE_LIBBPF
        names = set_get_strv(c->restrict_filesystems);
        if (!names)
                return -ENOMEM;
#endif

        /* Second member: the names, in sorted order (the list stays NULL without libbpf) */
        strv_sort(names);

        r = sd_bus_message_append_strv(reply, names);
        if (r < 0)
                return r;

        return sd_bus_message_close_container(reply);
}
static int property_get_bind_paths(
sd_bus *bus,
const char *path,
@ -1199,6 +1240,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictSUIDSGID", "b", bus_property_get_bool, offsetof(ExecContext, restrict_suid_sgid), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictNamespaces", "t", bus_property_get_ulong, offsetof(ExecContext, restrict_namespaces), SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("RestrictFileSystems", "(bas)", property_get_restrict_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("BindReadOnlyPaths", "a(ssbt)", property_get_bind_paths, 0, SD_BUS_VTABLE_PROPERTY_CONST),
SD_BUS_PROPERTY("TemporaryFileSystem", "a(ss)", property_get_temporary_filesystems, 0, SD_BUS_VTABLE_PROPERTY_CONST),
@ -1875,6 +1917,64 @@ int bus_exec_context_set_transient_property(
if (streq(name, "RestrictNamespaces"))
return bus_set_transient_namespace_flag(u, name, &c->restrict_namespaces, message, flags, error);
if (streq(name, "RestrictFileSystems")) {
int allow_list;
_cleanup_strv_free_ char **l = NULL;
r = sd_bus_message_enter_container(message, 'r', "bas");
if (r < 0)
return r;
r = sd_bus_message_read(message, "b", &allow_list);
if (r < 0)
return r;
r = sd_bus_message_read_strv(message, &l);
if (r < 0)
return r;
r = sd_bus_message_exit_container(message);
if (r < 0)
return r;
if (!UNIT_WRITE_FLAGS_NOOP(flags)) {
_cleanup_free_ char *joined = NULL;
FilesystemParseFlags invert_flag = allow_list ? 0 : FILESYSTEM_PARSE_INVERT;
char **s;
if (strv_isempty(l)) {
c->restrict_filesystems_allow_list = false;
c->restrict_filesystems = set_free(c->restrict_filesystems);
unit_write_setting(u, flags, name, "RestrictFileSystems=");
return 1;
}
if (!c->restrict_filesystems)
c->restrict_filesystems_allow_list = allow_list;
STRV_FOREACH(s, l) {
r = lsm_bpf_parse_filesystem(
*s,
&c->restrict_filesystems,
FILESYSTEM_PARSE_LOG|
(invert_flag ? FILESYSTEM_PARSE_INVERT : 0)|
(c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
u->id, NULL, 0);
if (r < 0)
return r;
}
joined = strv_join(l, " ");
if (!joined)
return -ENOMEM;
unit_write_settingf(u, flags, name, "%s=%s%s", name, allow_list ? "" : "~", joined);
}
return 1;
}
if (streq(name, "MountFlags"))
return bus_set_transient_mount_flags(u, name, &c->mount_flags, message, flags, error);

View File

@ -41,6 +41,7 @@
#endif
#include "async.h"
#include "barrier.h"
#include "bpf-lsm.h"
#include "cap-list.h"
#include "capability-util.h"
#include "cgroup-setup.h"
@ -1685,6 +1686,29 @@ static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
return seccomp_restrict_namespaces(c->restrict_namespaces);
}
#if HAVE_LIBBPF
/* Returns true, after logging a debug message naming 'msg', if LSM BPF is unsupported and the setting
 * should hence be skipped; returns false if LSM BPF is supported. */
static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
        bool unsupported = !lsm_bpf_supported();

        if (unsupported)
                log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);

        return unsupported;
}
/* Enforces RestrictFileSystems= for the unit via LSM BPF. A no-op returning 0 if the setting is not
 * configured or LSM BPF is unsupported; otherwise returns what lsm_bpf_unit_restrict_filesystems()
 * returns. */
static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        /* Evaluated left to right: the support check (and its log message) only happens if the setting
         * is actually in use. */
        if (!exec_context_restrict_filesystems_set(c) ||
            skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
                return 0;

        return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
}
#endif
static int apply_lock_personality(const Unit* u, const ExecContext *c) {
unsigned long personality;
int r;
@ -3813,7 +3837,7 @@ static int exec_child(
/* In case anything used libc syslog(), close this here, too */
closelog();
int keep_fds[n_fds + 2];
int keep_fds[n_fds + 3];
memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
n_keep_fds = n_fds;
@ -3823,6 +3847,24 @@ static int exec_child(
return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
}
#if HAVE_LIBBPF
if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
int bpf_map_fd = -1;
bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
if (bpf_map_fd < 0) {
*exit_status = EXIT_FDS;
return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
}
r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
if (r < 0) {
*exit_status = EXIT_FDS;
return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
}
}
#endif
r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
if (r < 0) {
*exit_status = EXIT_FDS;
@ -4682,6 +4724,15 @@ static int exec_child(
return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
}
#endif
#if HAVE_LIBBPF
r = apply_restrict_filesystems(unit, context);
if (r < 0) {
*exit_status = EXIT_BPF;
return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
}
#endif
}
if (!strv_isempty(context->unset_environment)) {
@ -4967,6 +5018,8 @@ void exec_context_done(ExecContext *c) {
c->apparmor_profile = mfree(c->apparmor_profile);
c->smack_process_label = mfree(c->smack_process_label);
c->restrict_filesystems = set_free(c->restrict_filesystems);
c->syscall_filter = hashmap_free(c->syscall_filter);
c->syscall_archs = set_free(c->syscall_archs);
c->address_families = set_free(c->address_families);
@ -5734,6 +5787,12 @@ void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
prefix, strna(s));
}
#if HAVE_LIBBPF
if (exec_context_restrict_filesystems_set(c))
SET_FOREACH(e, c->restrict_filesystems)
fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
#endif
if (c->network_namespace_path)
fprintf(f,
"%sNetworkNamespacePath: %s\n",

View File

@ -314,6 +314,9 @@ struct ExecContext {
unsigned long restrict_namespaces; /* The CLONE_NEWxyz flags permitted to the unit's processes */
Set *restrict_filesystems;
bool restrict_filesystems_allow_list:1;
Hashmap *syscall_filter;
Set *syscall_archs;
int syscall_errno;
@ -342,6 +345,13 @@ static inline bool exec_context_restrict_namespaces_set(const ExecContext *c) {
return (c->restrict_namespaces & NAMESPACE_FLAGS_ALL) != NAMESPACE_FLAGS_ALL;
}
/* True if RestrictFileSystems= is in effect: either an allow list is configured (even an empty one),
 * or the set of filesystems is non-empty. */
static inline bool exec_context_restrict_filesystems_set(const ExecContext *c) {
        assert(c);

        if (c->restrict_filesystems_allow_list)
                return true;

        return !set_isempty(c->restrict_filesystems);
}
static inline bool exec_context_with_rootfs(const ExecContext *c) {
assert(c);

View File

@ -81,6 +81,7 @@
{{type}}.RestrictAddressFamilies, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
{{type}}.LockPersonality, config_parse_warn_compat, DISABLED_CONFIGURATION, 0
{% endif %}
{{type}}.RestrictFileSystems, config_parse_restrict_filesystems, 0, offsetof({{type}}, exec_context)
{{type}}.LimitCPU, config_parse_rlimit, RLIMIT_CPU, offsetof({{type}}, exec_context.rlimit)
{{type}}.LimitFSIZE, config_parse_rlimit, RLIMIT_FSIZE, offsetof({{type}}, exec_context.rlimit)
{{type}}.LimitDATA, config_parse_rlimit, RLIMIT_DATA, offsetof({{type}}, exec_context.rlimit)

View File

@ -19,6 +19,7 @@
#include "all-units.h"
#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-lsm.h"
#include "bpf-program.h"
#include "bpf-socket-bind.h"
#include "bus-error.h"
@ -3597,6 +3598,76 @@ int config_parse_restrict_namespaces(
}
#endif
int config_parse_restrict_filesystems(
const char *unit,
const char *filename,
unsigned line,
const char *section,
unsigned section_line,
const char *lvalue,
int ltype,
const char *rvalue,
void *data,
void *userdata) {
ExecContext *c = data;
bool invert = false;
int r;
assert(filename);
assert(lvalue);
assert(rvalue);
assert(data);
if (isempty(rvalue)) {
/* Empty assignment resets the list */
c->restrict_filesystems = set_free(c->restrict_filesystems);
c->restrict_filesystems_allow_list = false;
return 0;
}
if (rvalue[0] == '~') {
invert = true;
rvalue++;
}
if (!c->restrict_filesystems) {
if (invert)
/* Allow everything but the ones listed */
c->restrict_filesystems_allow_list = false;
else
/* Allow nothing but the ones listed */
c->restrict_filesystems_allow_list = true;
}
for (const char *p = rvalue;;) {
_cleanup_free_ char *word = NULL;
r = extract_first_word(&p, &word, NULL, EXTRACT_UNQUOTE);
if (r == 0)
break;
if (r == -ENOMEM)
return log_oom();
if (r < 0) {
log_syntax(unit, LOG_WARNING, filename, line, r,
"Trailing garbage in %s, ignoring: %s", lvalue, rvalue);
break;
}
r = lsm_bpf_parse_filesystem(
word,
&c->restrict_filesystems,
FILESYSTEM_PARSE_LOG|
(invert ? FILESYSTEM_PARSE_INVERT : 0)|
(c->restrict_filesystems_allow_list ? FILESYSTEM_PARSE_ALLOW_LIST : 0),
unit, filename, line);
if (r < 0)
return r;
}
return 0;
}
int config_parse_unit_slice(
const char *unit,
const char *filename,
@ -6030,6 +6101,7 @@ void unit_dump_config_items(FILE *f) {
{ config_parse_address_families, "FAMILIES" },
{ config_parse_restrict_namespaces, "NAMESPACES" },
#endif
{ config_parse_restrict_filesystems, "FILESYSTEMS" },
{ config_parse_cpu_shares, "SHARES" },
{ config_parse_cg_weight, "WEIGHT" },
{ config_parse_memory_limit, "LIMIT" },

View File

@ -113,6 +113,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_sec_fix_0);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_user_group_strv_compat);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_namespaces);
CONFIG_PARSER_PROTOTYPE(config_parse_restrict_filesystems);
CONFIG_PARSER_PROTOTYPE(config_parse_bind_paths);
CONFIG_PARSER_PROTOTYPE(config_parse_exec_keyring_mode);
CONFIG_PARSER_PROTOTYPE(config_parse_protect_proc);

View File

@ -22,6 +22,9 @@
#include "alloc-util.h"
#include "apparmor-setup.h"
#include "architecture.h"
#if HAVE_LIBBPF
#include "bpf-lsm.h"
#endif
#include "build.h"
#include "bus-error.h"
#include "bus-util.h"

View File

@ -930,6 +930,14 @@ int manager_new(UnitFileScope scope, ManagerTestRunFlags test_run_flags, Manager
r = manager_setup_sigchld_event_source(m);
if (r < 0)
return r;
#if HAVE_LIBBPF
if (MANAGER_IS_SYSTEM(m) && lsm_bpf_supported()) {
r = lsm_bpf_setup(m);
if (r < 0)
return r;
}
#endif
}
if (test_run_flags == 0) {
@ -1535,6 +1543,10 @@ Manager* manager_free(Manager *m) {
m->prefix[dt] = mfree(m->prefix[dt]);
free(m->received_credentials);
#if BPF_FRAMEWORK
lsm_bpf_destroy(m->restrict_fs);
#endif
return mfree(m);
}

View File

@ -450,6 +450,9 @@ struct Manager {
* we're a user manager, this object manages the client connection from the user manager to
* systemd-oomd to report changes in ManagedOOM settings (systemd client - oomd server). */
Varlink *managed_oom_varlink;
/* Reference to RestrictFileSystems= BPF program */
struct restrict_fs_bpf *restrict_fs;
};
static inline usec_t manager_default_timeout_abort_usec(Manager *m) {

View File

@ -13,6 +13,8 @@ libcore_sources = '''
bpf-firewall.h
bpf-foreign.c
bpf-foreign.h
bpf-lsm.c
bpf-lsm.h
bpf-socket-bind.c
bpf-socket-bind.h
cgroup.c
@ -134,6 +136,8 @@ libcore_sources = '''
subdir('bpf/socket_bind')
if conf.get('BPF_FRAMEWORK') == 1
libcore_sources += [socket_bind_skel_h]
subdir('bpf/restrict_fs')
libcore_sources += [restrict_fs_skel_h]
endif
subdir('bpf/restrict_ifaces')

View File

@ -294,6 +294,7 @@ typedef struct Unit {
/* Counterparts in the cgroup filesystem */
char *cgroup_path;
uint64_t cgroup_id;
CGroupMask cgroup_realized_mask; /* In which hierarchies does this unit's cgroup exist? (only relevant on cgroup v1) */
CGroupMask cgroup_enabled_mask; /* Which controllers are enabled (or more correctly: enabled for the children) for this unit's cgroup? (only relevant on cgroup v2) */
CGroupMask cgroup_invalidated_mask; /* A mask specifying controllers which shall be considered invalidated, and require re-realization */

View File

@ -8,13 +8,17 @@
static void *bpf_dl = NULL;
struct bpf_link* (*sym_bpf_program__attach_cgroup)(struct bpf_program *, int);
struct bpf_link* (*sym_bpf_program__attach_lsm)(struct bpf_program *);
long (*sym_libbpf_get_error)(const void *);
int (*sym_bpf_link__fd)(const struct bpf_link *);
int (*sym_bpf_link__destroy)(struct bpf_link *);
int (*sym_bpf_map__fd)(const struct bpf_map *);
const char* (*sym_bpf_map__name)(const struct bpf_map *);
int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
int (*sym_bpf_map__resize)(struct bpf_map *, __u32);
int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64);
int (*sym_bpf_map_delete_elem)(int, const void *);
int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int);
int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *);
int (*sym_bpf_object__load_skeleton)(struct bpf_object_skeleton *);
int (*sym_bpf_object__attach_skeleton)(struct bpf_object_skeleton *);
@ -30,8 +34,11 @@ int dlopen_bpf(void) {
DLSYM_ARG(bpf_link__fd),
DLSYM_ARG(bpf_map__fd),
DLSYM_ARG(bpf_map__name),
DLSYM_ARG(bpf_create_map),
DLSYM_ARG(bpf_map__resize),
DLSYM_ARG(bpf_map_update_elem),
DLSYM_ARG(bpf_map_delete_elem),
DLSYM_ARG(bpf_map__set_inner_map_fd),
DLSYM_ARG(bpf_object__open_skeleton),
DLSYM_ARG(bpf_object__load_skeleton),
DLSYM_ARG(bpf_object__attach_skeleton),
@ -39,6 +46,7 @@ int dlopen_bpf(void) {
DLSYM_ARG(bpf_object__destroy_skeleton),
DLSYM_ARG(bpf_probe_prog_type),
DLSYM_ARG(bpf_program__attach_cgroup),
DLSYM_ARG(bpf_program__attach_lsm),
DLSYM_ARG(bpf_program__name),
DLSYM_ARG(libbpf_get_error));
}

View File

@ -7,13 +7,17 @@
#include <bpf/libbpf.h>
extern struct bpf_link* (*sym_bpf_program__attach_cgroup)(struct bpf_program *, int);
extern struct bpf_link* (*sym_bpf_program__attach_lsm)(struct bpf_program *);
extern long (*sym_libbpf_get_error)(const void *);
extern int (*sym_bpf_link__fd)(const struct bpf_link *);
extern int (*sym_bpf_link__destroy)(struct bpf_link *);
extern int (*sym_bpf_map__fd)(const struct bpf_map *);
extern const char* (*sym_bpf_map__name)(const struct bpf_map *);
extern int (*sym_bpf_create_map)(enum bpf_map_type, int key_size, int value_size, int max_entries, __u32 map_flags);
extern int (*sym_bpf_map__resize)(struct bpf_map *, __u32);
extern int (*sym_bpf_map_update_elem)(int, const void *, const void *, __u64);
extern int (*sym_bpf_map_delete_elem)(int, const void *);
extern int (*sym_bpf_map__set_inner_map_fd)(struct bpf_map *, int);
/* The *_skeleton APIs are autogenerated by bpftool, the targets can be found
* in ./build/src/core/bpf/socket_bind/socket-bind.skel.h */
extern int (*sym_bpf_object__open_skeleton)(struct bpf_object_skeleton *, const struct bpf_object_open_opts *);

View File

@ -1386,6 +1386,7 @@ static int bus_append_execute_property(sd_bus_message *m, const char *field, con
}
if (STR_IN_SET(field, "RestrictAddressFamilies",
"RestrictFileSystems",
"SystemCallFilter",
"SystemCallLog",
"RestrictNetworkInterfaces")) {

View File

@ -18,8 +18,8 @@ const ExitStatusMapping exit_status_mappings[256] = {
* 863 (Currently unmapped)
* 6478 BSD defined exit codes
* 79199 (Currently unmapped)
* 200243 systemd's private error codes (might be extended to 254 in future development)
* 244254 (Currently unmapped, but see above)
* 200244 systemd's private error codes (might be extended to 254 in future development)
* 245254 (Currently unmapped, but see above)
*
* 255 EXIT_EXCEPTION (We use this to propagate exit-by-signal events. It's frequently used by others apps (like bash)
* to indicate exit reason that cannot really be expressed in a single exit status value such as a propagated
@ -71,6 +71,7 @@ const ExitStatusMapping exit_status_mappings[256] = {
[EXIT_CONFIGURATION_DIRECTORY] = { "CONFIGURATION_DIRECTORY", EXIT_STATUS_SYSTEMD },
[EXIT_NUMA_POLICY] = { "NUMA_POLICY", EXIT_STATUS_SYSTEMD },
[EXIT_CREDENTIALS] = { "CREDENTIALS", EXIT_STATUS_SYSTEMD },
[EXIT_BPF] = { "BPF", EXIT_STATUS_SYSTEMD },
[EXIT_EXCEPTION] = { "EXCEPTION", EXIT_STATUS_SYSTEMD },

View File

@ -71,6 +71,7 @@ enum {
EXIT_CONFIGURATION_DIRECTORY,
EXIT_NUMA_POLICY,
EXIT_CREDENTIALS,
EXIT_BPF,
EXIT_EXCEPTION = 255, /* Whenever we want to propagate an abnormal/signal exit, in line with bash */
};

View File

@ -1023,7 +1023,7 @@ static int print_property(const char *name, const char *expected_value, sd_bus_m
return 1;
} else if (STR_IN_SET(name, "SystemCallFilter", "SystemCallLog", "RestrictAddressFamilies", "RestrictNetworkInterfaces")) {
} else if (STR_IN_SET(name, "SystemCallFilter", "SystemCallLog", "RestrictAddressFamilies", "RestrictNetworkInterfaces", "RestrictFileSystems")) {
_cleanup_strv_free_ char **l = NULL;
int allow_list;

View File

@ -357,6 +357,17 @@ tests += [
[],
core_includes],
[['src/test/test-bpf-lsm.c'],
[libcore,
libshared],
[libmount,
threads,
librt,
libseccomp,
libselinux,
libblkid],
core_includes],
[['src/test/test-watch-pid.c'],
[libcore,
libshared],

109
src/test/test-bpf-lsm.c Normal file
View File

@ -0,0 +1,109 @@
/* SPDX-License-Identifier: LGPL-2.1-or-later */
#include "bpf-lsm.h"
#include "load-fragment.h"
#include "manager.h"
#include "process-util.h"
#include "rlimit-util.h"
#include "rm-rf.h"
#include "service.h"
#include "strv.h"
#include "tests.h"
#include "unit.h"
#include "virt.h"
/* Runs "cat <file_path>" as a one-shot service with the given RestrictFileSystems= settings applied.
 *
 * A new Service unit named unit_name is allocated on manager m, every entry of allowed_filesystems is
 * fed through config_parse_restrict_filesystems(), an ExecStart= line is installed, and the unit is
 * started. The manager's event loop is then driven until the service reaches SERVICE_DEAD or
 * SERVICE_FAILED. The unit is released automatically when the function returns.
 *
 * Returns 0 if the ExecStart= command terminated with CLD_EXITED and the service ended up in
 * SERVICE_DEAD; otherwise a negative errno-style value is returned after logging the reason. */
static int test_restrict_filesystems(Manager *m, const char *unit_name, const char *file_path, char **allowed_filesystems) {
        _cleanup_free_ char *exec_start = NULL;
        _cleanup_(unit_freep) Unit *u = NULL;
        ExecContext *ec = NULL;
        char **allow_filesystem;
        int cld_code, r;

        assert_se(u = unit_new(m, sizeof(Service)));
        assert_se(unit_add_name(u, unit_name) == 0);
        assert_se(ec = unit_get_exec_context(u));

        /* Each entry is parsed as its own RestrictFileSystems= assignment, in the order given. */
        STRV_FOREACH(allow_filesystem, allowed_filesystems) {
                r = config_parse_restrict_filesystems(
                                u->id, "filename", 1, "Service", 1, "RestrictFileSystems", 0,
                                *allow_filesystem, ec, u);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Failed to parse RestrictFileSystems: %m");
        }

        assert_se(exec_start = strjoin("cat ", file_path));
        r = config_parse_exec(u->id, "filename", 1, "Service", 1, "ExecStart",
                              SERVICE_EXEC_START, exec_start, SERVICE(u)->exec_command, u);
        if (r < 0)
                return log_error_errno(r, "Failed to parse ExecStart: %m");

        SERVICE(u)->type = SERVICE_ONESHOT;
        u->load_state = UNIT_LOADED;

        r = unit_start(u);
        if (r < 0)
                return log_error_errno(r, "Unit start failed %m");

        /* Pump the event loop until the service has settled in a final state. */
        while (!IN_SET(SERVICE(u)->state, SERVICE_DEAD, SERVICE_FAILED)) {
                r = sd_event_run(m->event, UINT64_MAX);
                if (r < 0)
                        /* sd_event_run() reports failures via its return value, not via errno. */
                        return log_error_errno(r, "Event run failed %m");
        }

        cld_code = SERVICE(u)->exec_command[SERVICE_EXEC_START]->exec_status.code;
        if (cld_code != CLD_EXITED)
                /* SYNTHETIC_ERRNO() must be passed un-negated, otherwise the synthetic marker is lost. */
                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "ExecStart didn't exited, code='%s'", sigchld_code_to_string(cld_code));

        if (SERVICE(u)->state != SERVICE_DEAD)
                return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Service is not dead");

        return 0;
}
/* Test entry point for RestrictFileSystems=.
 *
 * The test is skipped unless it runs as root, memory locking is usable, lsm_bpf_supported() reports
 * support and cgroupfs is available. Otherwise a test manager is brought up and a series of
 * test_restrict_filesystems() runs is checked against the expected success or failure. */
int main(int argc, char *argv[]) {
        _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL;
        _cleanup_(manager_freep) Manager *m = NULL;
        _cleanup_free_ char *unit_dir = NULL;
        struct rlimit rl;

        test_setup_logging(LOG_DEBUG);

        if (getuid() != 0)
                return log_tests_skipped("not running as root");

        /* Raise both the soft and the hard RLIMIT_MEMLOCK to at least CAN_MEMLOCK_SIZE (best effort). */
        assert_se(getrlimit(RLIMIT_MEMLOCK, &rl) >= 0);
        rl.rlim_max = MAX(rl.rlim_max, CAN_MEMLOCK_SIZE);
        rl.rlim_cur = rl.rlim_max;
        (void) setrlimit_closest(RLIMIT_MEMLOCK, &rl);

        if (!can_memlock())
                return log_tests_skipped("Can't use mlock(), skipping.");

        if (lsm_bpf_supported() <= 0)
                return log_tests_skipped("LSM BPF hooks are not supported");

        if (enter_cgroup_subroot(NULL) == -ENOMEDIUM)
                return log_tests_skipped("cgroupfs not available");

        /* Bring up a manager in test mode, using the test data unit directory. */
        assert_se(get_testdata_dir("units", &unit_dir) >= 0);
        assert_se(set_unit_path(unit_dir) >= 0);
        assert_se(runtime_dir = setup_fake_runtime_dir());

        assert_se(manager_new(UNIT_FILE_SYSTEM, MANAGER_TEST_RUN_BASIC, &m) >= 0);
        assert_se(manager_startup(m, NULL, NULL, NULL) >= 0);

        /* @common-block is always part of the allow-lists below: the filesystem holding the executed
         * binary has to stay accessible, otherwise the command could not be run at all. */

        /* A file on tracefs: readable only while tracefs is allowed. */
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("@common-block")) < 0);
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("tracefs", "@common-block")) >= 0);
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/tracing/printk_formats", STRV_MAKE("tracefs", "@common-block", "~tracefs")) < 0);

        /* A file on debugfs: readable only while debugfs is allowed. */
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("@common-block")) < 0);
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("debugfs", "@common-block")) >= 0);
        assert_se(test_restrict_filesystems(m, "restrict_filesystems_test.service", "/sys/kernel/debug/sleep_time", STRV_MAKE("~debugfs")) < 0);

        return 0;
}

View File

@ -144,6 +144,7 @@ ReadWritePaths=
RemoveIPC=
RestartKillSignal=
RestrictAddressFamilies=
RestrictFileSystems=
RestrictNamespaces=
RestrictNetworkInterfaces=
RestrictRealtime=

View File

@ -275,6 +275,7 @@ RestartKillSignal=
RestartPreventExitStatus=
RestartSec=
RestrictAddressFamilies=
RestrictFileSystems=
RestrictNamespaces=
RestrictNetworkInterfaces=
RestrictRealtime=

View File

@ -180,6 +180,7 @@ RemoveIPC=
RemoveOnStop=
RestartKillSignal=
RestrictAddressFamilies=
RestrictFileSystems=
RestrictNamespaces=
RestrictNetworkInterfaces=
RestrictRealtime=

View File

@ -141,6 +141,7 @@ ReadWritePaths=
RemoveIPC=
RestartKillSignal=
RestrictAddressFamilies=
RestrictFileSystems=
RestrictNamespaces=
RestrictNetworkInterfaces=
RestrictRealtime=