mirror of
https://github.com/systemd/systemd.git
synced 2025-01-10 05:18:17 +03:00
nspawn: replace syscall blacklist by a whitelist
Let's lock things down a bit, and maintain a list of what's permitted rather than a list of what's prohibited in nspawn (also to make things a bit more like Docker and friends).

Note that this slightly alters the effect of --system-call-filter=, as the negative list now takes precedence over the positive list. However, given that the option is just a few days old and not included in any released version, it should be fine to change it at this point in time.

Note that the whitelist is a good chunk more restrictive than the previous blacklist. Specifically:

- fanotify is not permitted (given the buffer size issues it's problematic in containers)
- nfsservctl is not permitted (NFS server support is not virtualized)
- pkey_xyz stuff is not permitted (really new stuff I don't grok)
- @cpu-emulation is prohibited (untested legacy stuff mostly, and if people really want to run dosemu in nspawn, they should use --system-call-filter=@cpu-emulation and all should be good)
This commit is contained in:
parent
cff7bff880
commit
96bedbe2e5
@ -723,9 +723,9 @@
|
|||||||
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
|
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
|
||||||
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
|
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
|
||||||
combined. If both a positive and a negative list (that is one system call list without and one with the
|
combined. If both a positive and a negative list (that is one system call list without and one with the
|
||||||
<literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
|
<literal>~</literal> prefix) are configured, the negative list takes precedence over the positive list. Note
|
||||||
that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
|
that <command>systemd-nspawn</command> always implements a system call whitelist (as opposed to a blacklist),
|
||||||
and this command line option hence adds or removes entries from the default blacklist, depending on the
|
and this command line option hence adds or removes entries from the default whitelist, depending on the
|
||||||
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
|
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
|
||||||
capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
|
capabilities are passed using the <command>--capabilities=</command>.</para></listitem>
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
@ -47,47 +47,154 @@ static int seccomp_add_default_syscall_filter(
|
|||||||
static const struct {
|
static const struct {
|
||||||
uint64_t capability;
|
uint64_t capability;
|
||||||
const char* name;
|
const char* name;
|
||||||
} blacklist[] = {
|
} whitelist[] = {
|
||||||
{ 0, "@obsolete" },
|
/* Let's use set names where we can */
|
||||||
{ 0, "@keyring" }, /* keyring is not namespaced */
|
{ 0, "@basic-io" },
|
||||||
{ 0, "bpf" },
|
{ 0, "@credentials" },
|
||||||
{ 0, "kexec_file_load" },
|
{ 0, "@default" },
|
||||||
{ 0, "kexec_load" },
|
{ 0, "@file-system" },
|
||||||
{ 0, "lookup_dcookie" },
|
{ 0, "@io-event" },
|
||||||
{ 0, "open_by_handle_at" },
|
{ 0, "@ipc" },
|
||||||
{ 0, "perf_event_open" },
|
{ 0, "@mount" },
|
||||||
{ 0, "quotactl" },
|
{ 0, "@network-io" },
|
||||||
{ 0, "@swap" },
|
{ 0, "@process" },
|
||||||
{ CAP_SYSLOG, "syslog" },
|
{ 0, "@resources" },
|
||||||
{ CAP_SYS_MODULE, "@module" },
|
{ 0, "@setuid" },
|
||||||
{ CAP_SYS_PACCT, "acct" },
|
{ 0, "@signal" },
|
||||||
{ CAP_SYS_PTRACE, "process_vm_readv" },
|
{ 0, "@timer" },
|
||||||
{ CAP_SYS_PTRACE, "process_vm_writev" },
|
|
||||||
{ CAP_SYS_PTRACE, "ptrace" },
|
/* The following four are sets we optionally enable, in case the caps have been configured for it */
|
||||||
{ CAP_SYS_RAWIO, "@raw-io" },
|
{ CAP_SYS_TIME, "@clock" },
|
||||||
{ CAP_SYS_TIME, "@clock" },
|
{ CAP_SYS_MODULE, "@module" },
|
||||||
|
{ CAP_SYS_RAWIO, "@raw-io" },
|
||||||
|
{ CAP_IPC_LOCK, "@memlock" },
|
||||||
|
|
||||||
|
/* Plus a good set of additional syscalls which are not part of any of the groups above */
|
||||||
|
{ 0, "brk" },
|
||||||
|
{ 0, "capset" },
|
||||||
|
{ 0, "chown" },
|
||||||
|
{ 0, "chown32" },
|
||||||
|
{ 0, "copy_file_range" },
|
||||||
|
{ 0, "fadvise64" },
|
||||||
|
{ 0, "fadvise64_64" },
|
||||||
|
{ 0, "fchown" },
|
||||||
|
{ 0, "fchown32" },
|
||||||
|
{ 0, "fchownat" },
|
||||||
|
{ 0, "fdatasync" },
|
||||||
|
{ 0, "flock" },
|
||||||
|
{ 0, "fsync" },
|
||||||
|
{ 0, "get_mempolicy" },
|
||||||
|
{ 0, "getcpu" },
|
||||||
|
{ 0, "getpriority" },
|
||||||
|
{ 0, "getrandom" },
|
||||||
|
{ 0, "io_cancel" },
|
||||||
|
{ 0, "io_destroy" },
|
||||||
|
{ 0, "io_getevents" },
|
||||||
|
{ 0, "io_setup" },
|
||||||
|
{ 0, "io_submit" },
|
||||||
|
{ 0, "ioctl" },
|
||||||
|
{ 0, "ioprio_get" },
|
||||||
|
{ 0, "kcmp" },
|
||||||
|
{ 0, "lchown" },
|
||||||
|
{ 0, "lchown32" },
|
||||||
|
{ 0, "madvise" },
|
||||||
|
{ 0, "mincore" },
|
||||||
|
{ 0, "mprotect" },
|
||||||
|
{ 0, "mremap" },
|
||||||
|
{ 0, "msync" },
|
||||||
|
{ 0, "name_to_handle_at" },
|
||||||
|
{ 0, "oldolduname" },
|
||||||
|
{ 0, "olduname" },
|
||||||
|
{ 0, "personality" },
|
||||||
|
{ 0, "preadv2" },
|
||||||
|
{ 0, "pwritev2" },
|
||||||
|
{ 0, "readahead" },
|
||||||
|
{ 0, "readdir" },
|
||||||
|
{ 0, "remap_file_pages" },
|
||||||
|
{ 0, "sched_get_priority_max" },
|
||||||
|
{ 0, "sched_get_priority_min" },
|
||||||
|
{ 0, "sched_getaffinity" },
|
||||||
|
{ 0, "sched_getattr" },
|
||||||
|
{ 0, "sched_getparam" },
|
||||||
|
{ 0, "sched_getscheduler" },
|
||||||
|
{ 0, "sched_rr_get_interval" },
|
||||||
|
{ 0, "sched_yield" },
|
||||||
|
{ 0, "seccomp" },
|
||||||
|
{ 0, "sendfile" },
|
||||||
|
{ 0, "sendfile64" },
|
||||||
|
{ 0, "setdomainname" },
|
||||||
|
{ 0, "setfsgid" },
|
||||||
|
{ 0, "setfsgid32" },
|
||||||
|
{ 0, "setfsuid" },
|
||||||
|
{ 0, "setfsuid32" },
|
||||||
|
{ 0, "sethostname" },
|
||||||
|
{ 0, "setpgid" },
|
||||||
|
{ 0, "setsid" },
|
||||||
|
{ 0, "splice" },
|
||||||
|
{ 0, "sync" },
|
||||||
|
{ 0, "sync_file_range" },
|
||||||
|
{ 0, "syncfs" },
|
||||||
|
{ 0, "sysinfo" },
|
||||||
|
{ 0, "tee" },
|
||||||
|
{ 0, "ugetrlimit" },
|
||||||
|
{ 0, "umask" },
|
||||||
|
{ 0, "uname" },
|
||||||
|
{ 0, "userfaultfd" },
|
||||||
|
{ 0, "vmsplice" },
|
||||||
|
|
||||||
|
/* The following individual syscalls are added depending on specified caps */
|
||||||
|
{ CAP_SYS_PACCT, "acct" },
|
||||||
|
{ CAP_SYS_PTRACE, "process_vm_readv" },
|
||||||
|
{ CAP_SYS_PTRACE, "process_vm_writev" },
|
||||||
|
{ CAP_SYS_PTRACE, "ptrace" },
|
||||||
|
{ CAP_SYS_BOOT, "reboot" },
|
||||||
|
{ CAP_SYSLOG, "syslog" },
|
||||||
|
{ CAP_SYS_TTY_CONFIG, "vhangup" },
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The following syscalls and groups are knowingly excluded:
|
||||||
|
*
|
||||||
|
* @cpu-emulation
|
||||||
|
* @keyring (NB: keyring is not namespaced!)
|
||||||
|
* @obsolete
|
||||||
|
* @swap
|
||||||
|
*
|
||||||
|
* bpf (NB: bpffs is not namespaced!)
|
||||||
|
* fanotify_init
|
||||||
|
* fanotify_mark
|
||||||
|
* kexec_file_load
|
||||||
|
* kexec_load
|
||||||
|
* lookup_dcookie
|
||||||
|
* nfsservctl
|
||||||
|
* open_by_handle_at
|
||||||
|
* perf_event_open
|
||||||
|
* pkey_alloc
|
||||||
|
* pkey_free
|
||||||
|
* pkey_mprotect
|
||||||
|
* quotactl
|
||||||
|
*/
|
||||||
};
|
};
|
||||||
|
|
||||||
int r, c = 0;
|
int r, c = 0;
|
||||||
size_t i;
|
size_t i;
|
||||||
char **p;
|
char **p;
|
||||||
|
|
||||||
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
|
for (i = 0; i < ELEMENTSOF(whitelist); i++) {
|
||||||
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
|
if (whitelist[i].capability != 0 && (cap_list_retain & (1ULL << whitelist[i].capability)) == 0)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
|
r = seccomp_add_syscall_filter_item(ctx, whitelist[i].name, SCMP_ACT_ALLOW, syscall_blacklist);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
|
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
|
||||||
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
|
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", whitelist[i].name, seccomp_arch_to_string(arch));
|
||||||
else
|
else
|
||||||
c++;
|
c++;
|
||||||
}
|
}
|
||||||
|
|
||||||
STRV_FOREACH(p, syscall_blacklist) {
|
STRV_FOREACH(p, syscall_whitelist) {
|
||||||
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
|
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ALLOW, syscall_blacklist);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
|
log_debug_errno(r, "Failed to add rule for system call %s on %s, ignoring: %m", *p, seccomp_arch_to_string(arch));
|
||||||
else
|
else
|
||||||
c++;
|
c++;
|
||||||
}
|
}
|
||||||
@ -106,18 +213,33 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
|
|||||||
|
|
||||||
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
||||||
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
||||||
int n;
|
|
||||||
|
|
||||||
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
|
log_debug("Applying whitelist on architecture: %s", seccomp_arch_to_string(arch));
|
||||||
|
|
||||||
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ERRNO(EPERM));
|
||||||
|
if (r < 0)
|
||||||
|
return log_error_errno(r, "Failed to allocate seccomp object: %m");
|
||||||
|
|
||||||
|
r = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
|
||||||
|
if (r < 0)
|
||||||
|
return r;
|
||||||
|
|
||||||
|
r = seccomp_load(seccomp);
|
||||||
|
if (IN_SET(r, -EPERM, -EACCES))
|
||||||
|
return log_error_errno(r, "Failed to install seccomp filter: %m");
|
||||||
|
if (r < 0)
|
||||||
|
log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
|
||||||
|
}
|
||||||
|
|
||||||
|
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
|
||||||
|
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
|
||||||
|
|
||||||
|
log_debug("Applying NETLINK_AUDIT mask on architecture: %s", seccomp_arch_to_string(arch));
|
||||||
|
|
||||||
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
|
||||||
if (r < 0)
|
if (r < 0)
|
||||||
return log_error_errno(r, "Failed to allocate seccomp object: %m");
|
return log_error_errno(r, "Failed to allocate seccomp object: %m");
|
||||||
|
|
||||||
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
|
|
||||||
if (n < 0)
|
|
||||||
return n;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
|
Audit is broken in containers, much of the userspace audit hookup will fail if running inside a
|
||||||
container. We don't care and just turn off creation of audit sockets.
|
container. We don't care and just turn off creation of audit sockets.
|
||||||
@ -133,13 +255,10 @@ int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **sys
|
|||||||
2,
|
2,
|
||||||
SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
|
SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
|
||||||
SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
|
SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
|
||||||
if (r < 0)
|
if (r < 0) {
|
||||||
log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
|
log_debug_errno(r, "Failed to add audit seccomp rule, ignoring: %m");
|
||||||
else
|
|
||||||
n++;
|
|
||||||
|
|
||||||
if (n <= 0) /* no rule added? then skip this architecture */
|
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
r = seccomp_load(seccomp);
|
r = seccomp_load(seccomp);
|
||||||
if (IN_SET(r, -EPERM, -EACCES))
|
if (IN_SET(r, -EPERM, -EACCES))
|
||||||
|
Loading…
Reference in New Issue
Block a user