mirror of
https://github.com/systemd/systemd.git
synced 2025-03-28 02:50:16 +03:00
nspawn: implement configurable syscall whitelisting/blacklisting
Now that we have ported nspawn's seccomp code to the generic code in seccomp-util, let's extend it to support whitelisting and blacklisting of specific additional syscalls. This uses similar syntax as PID1's support for system call filtering, but in contrast to that always implements a blacklist (and not a whitelist), as we prepopulate the filter with a blacklist, and the unit's system call filter logic does not come with anything prepopulated. (Later on we might actually want to invert the logic here, and whitelist rather than blacklist things, but at this point let's not do that. In case we switch this over later, the syscall add/remove logic of this commit should be compatible conceptually.) Fixes: #5163 Replaces: #5944
This commit is contained in:
parent
7609340e2f
commit
960e4569e1
@ -713,6 +713,23 @@
|
||||
above).</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--system-call-filter=</option></term>
|
||||
|
||||
<listitem><para>Alter the system call filter applied to containers. Takes a space-separated list of system call
|
||||
names or group names (the latter prefixed with <literal>@</literal>, as listed by the
|
||||
<command>syscall-filter</command> command of <citerefentry
|
||||
project='man-pages'><refentrytitle>systemd-analyze</refentrytitle><manvolnum>1</manvolnum></citerefentry>). Passed
|
||||
system calls will be permitted. The list may optionally be prefixed by <literal>~</literal>, in which case all
|
||||
listed system calls are prohibited. If this command line option is used multiple times the configured lists are
|
||||
combined. If both a positive and a negative list (that is one system call list without and one with the
|
||||
<literal>~</literal> prefix) are configured, the positive list takes precedence over the negative list. Note
|
||||
that <command>systemd-nspawn</command> always implements a system call blacklist (as opposed to a whitelist),
|
||||
and this command line option hence adds or removes entries from the default blacklist, depending on the
|
||||
<literal>~</literal> prefix. Note that the applied system call filter is also altered implicitly if additional
|
||||
capabilities are passed using the <option>--capability=</option> switch.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><option>--kill-signal=</option></term>
|
||||
|
||||
|
@ -274,11 +274,21 @@
|
||||
<varlistentry>
|
||||
<term><varname>NotifyReady=</varname></term>
|
||||
|
||||
<listitem><para>Configures support for notifications from the container's init process.
|
||||
This is equivalent to use <option>--notify-ready=</option> command line switch,
|
||||
and takes the same options. See <citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry>
|
||||
for details about the specific options supported.</para></listitem>
|
||||
<listitem><para>Configures support for notifications from the container's init process. This is equivalent to
|
||||
the <option>--notify-ready=</option> command line switch, and takes the same parameters. See
|
||||
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for details
|
||||
about the specific options supported.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
<varlistentry>
|
||||
<term><varname>SystemCallFilter=</varname></term>
|
||||
|
||||
<listitem><para>Configures the system call filter applied to containers. This is equivalent to the
|
||||
<option>--system-call-filter=</option> command line switch, and takes the same list parameter. See
|
||||
<citerefentry><refentrytitle>systemd-nspawn</refentrytitle><manvolnum>1</manvolnum></citerefentry> for
|
||||
details.</para></listitem>
|
||||
</varlistentry>
|
||||
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
|
||||
|
@ -29,6 +29,7 @@ Exec.WorkingDirectory, config_parse_path, 0, offsetof(Settings,
|
||||
Exec.PivotRoot, config_parse_pivot_root, 0, 0
|
||||
Exec.PrivateUsers, config_parse_private_users, 0, 0
|
||||
Exec.NotifyReady, config_parse_bool, 0, offsetof(Settings, notify_ready)
|
||||
Exec.SystemCallFilter, config_parse_syscall_filter,0, 0,
|
||||
Files.ReadOnly, config_parse_tristate, 0, offsetof(Settings, read_only)
|
||||
Files.Volatile, config_parse_volatile_mode, 0, offsetof(Settings, volatile_mode)
|
||||
Files.Bind, config_parse_bind, 0, 0
|
||||
|
@ -33,13 +33,16 @@
|
||||
#include "seccomp-util.h"
|
||||
#endif
|
||||
#include "string-util.h"
|
||||
#include "strv.h"
|
||||
|
||||
#ifdef HAVE_SECCOMP
|
||||
|
||||
static int seccomp_add_default_syscall_filter(
|
||||
scmp_filter_ctx ctx,
|
||||
uint32_t arch,
|
||||
uint64_t cap_list_retain) {
|
||||
uint64_t cap_list_retain,
|
||||
char **syscall_whitelist,
|
||||
char **syscall_blacklist) {
|
||||
|
||||
static const struct {
|
||||
uint64_t capability;
|
||||
@ -67,12 +70,13 @@ static int seccomp_add_default_syscall_filter(
|
||||
|
||||
int r, c = 0;
|
||||
size_t i;
|
||||
char **p;
|
||||
|
||||
for (i = 0; i < ELEMENTSOF(blacklist); i++) {
|
||||
if (blacklist[i].capability != 0 && (cap_list_retain & (1ULL << blacklist[i].capability)))
|
||||
continue;
|
||||
|
||||
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM));
|
||||
r = seccomp_add_syscall_filter_item(ctx, blacklist[i].name, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
|
||||
if (r < 0)
|
||||
/* If the system call is not known on this architecture, then that's fine, let's ignore it */
|
||||
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", blacklist[i].name);
|
||||
@ -80,15 +84,23 @@ static int seccomp_add_default_syscall_filter(
|
||||
c++;
|
||||
}
|
||||
|
||||
STRV_FOREACH(p, syscall_blacklist) {
|
||||
r = seccomp_add_syscall_filter_item(ctx, *p, SCMP_ACT_ERRNO(EPERM), syscall_whitelist);
|
||||
if (r < 0)
|
||||
log_debug_errno(r, "Failed to add rule for system call %s, ignoring: %m", *p);
|
||||
else
|
||||
c++;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
int setup_seccomp(uint64_t cap_list_retain) {
|
||||
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
|
||||
uint32_t arch;
|
||||
int r;
|
||||
|
||||
if (!is_seccomp_available()) {
|
||||
log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP audit filter");
|
||||
log_debug("SECCOMP features not detected in the kernel, disabling SECCOMP filterering");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -102,7 +114,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to allocate seccomp object: %m");
|
||||
|
||||
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain);
|
||||
n = seccomp_add_default_syscall_filter(seccomp, arch, cap_list_retain, syscall_whitelist, syscall_blacklist);
|
||||
if (n < 0)
|
||||
return n;
|
||||
|
||||
@ -141,7 +153,7 @@ int setup_seccomp(uint64_t cap_list_retain) {
|
||||
|
||||
#else
|
||||
|
||||
int setup_seccomp(uint64_t cap_list_retain) {
|
||||
/* Stub variant that follows the #else above: all three arguments are ignored and 0 is always returned.
 * NOTE(review): presumably this is the build without HAVE_SECCOMP; the matching #if is not visible in
 * this hunk, so confirm against the full file. */
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -21,4 +21,4 @@
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
int setup_seccomp(uint64_t cap_list_retain);
|
||||
int setup_seccomp(uint64_t cap_list_retain, char **syscall_whitelist, char **syscall_blacklist);
|
||||
|
@ -93,6 +93,8 @@ Settings* settings_free(Settings *s) {
|
||||
free(s->pivot_root_new);
|
||||
free(s->pivot_root_old);
|
||||
free(s->working_directory);
|
||||
strv_free(s->syscall_whitelist);
|
||||
strv_free(s->syscall_blacklist);
|
||||
|
||||
strv_free(s->network_interfaces);
|
||||
strv_free(s->network_macvlan);
|
||||
@ -568,3 +570,51 @@ int config_parse_private_users(
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Config-file parser for the SystemCallFilter= setting (registered as Exec.SystemCallFilter in the
 * gperf table).
 *
 * rvalue is split into words with extract_first_word(). If rvalue starts with '~', every word is
 * added to settings->syscall_blacklist; otherwise every word is added to settings->syscall_whitelist.
 * Words are added through strv_extend(); this function never clears either list.
 *
 * data is used as the Settings object to fill in. unit, filename and line are only passed on to
 * log_syntax(); section, section_line, ltype and userdata are not used in the body.
 *
 * Returns 0 on success and also after a logged parse failure (the remainder of the value is then
 * ignored); returns the result of log_oom() when extract_first_word() reports -ENOMEM or
 * strv_extend() fails. */
int config_parse_syscall_filter(
|
||||
const char *unit,
|
||||
const char *filename,
|
||||
unsigned line,
|
||||
const char *section,
|
||||
unsigned section_line,
|
||||
const char *lvalue,
|
||||
int ltype,
|
||||
const char *rvalue,
|
||||
void *data,
|
||||
void *userdata) {
|
||||
|
||||
Settings *settings = data;
|
||||
bool negative;
|
||||
const char *items;
|
||||
int r;
|
||||
|
||||
assert(filename);
|
||||
assert(lvalue);
|
||||
assert(rvalue);
|
||||
|
||||
/* A leading '~' marks the whole list as negative; it applies to every word of this rvalue. */
negative = rvalue[0] == '~';
|
||||
/* Skip the '~' itself so that tokenizing starts at the first name. */
items = negative ? rvalue + 1 : rvalue;
|
||||
|
||||
for (;;) {
|
||||
_cleanup_free_ char *word = NULL;
|
||||
|
||||
r = extract_first_word(&items, &word, NULL, 0);
|
||||
/* A return of 0 ends the loop (presumably "no more words"; extract_first_word() is defined elsewhere). */
if (r == 0)
|
||||
break;
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0) {
|
||||
/* Any other error is logged and parsing stops with success; words already stored by earlier
 * iterations stay in the lists. */
log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse SystemCallFilter= parameter %s, ignoring: %m", rvalue);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Route the word to the list selected by the '~' prefix. */
if (negative)
|
||||
r = strv_extend(&settings->syscall_blacklist, word);
|
||||
else
|
||||
r = strv_extend(&settings->syscall_whitelist, word);
|
||||
/* Every strv_extend() failure is reported through log_oom(). */
if (r < 0)
|
||||
return log_oom();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -58,7 +58,8 @@ typedef enum SettingsMask {
|
||||
SETTING_USERNS = 1 << 13,
|
||||
SETTING_NOTIFY_READY = 1 << 14,
|
||||
SETTING_PIVOT_ROOT = 1 << 15,
|
||||
_SETTINGS_MASK_ALL = (1 << 16) -1
|
||||
SETTING_SYSCALL_FILTER = 1 << 16,
|
||||
_SETTINGS_MASK_ALL = (1 << 17) -1
|
||||
} SettingsMask;
|
||||
|
||||
typedef struct Settings {
|
||||
@ -78,6 +79,8 @@ typedef struct Settings {
|
||||
UserNamespaceMode userns_mode;
|
||||
uid_t uid_shift, uid_range;
|
||||
bool notify_ready;
|
||||
char **syscall_whitelist;
|
||||
char **syscall_blacklist;
|
||||
|
||||
/* [Image] */
|
||||
int read_only;
|
||||
@ -121,3 +124,4 @@ int config_parse_network_zone(const char *unit, const char *filename, unsigned l
|
||||
int config_parse_boot(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_pid2(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_private_users(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
int config_parse_syscall_filter(const char *unit, const char *filename, unsigned line, const char *section, unsigned section_line, const char *lvalue, int ltype, const char *rvalue, void *data, void *userdata);
|
||||
|
@ -208,6 +208,8 @@ static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS
|
||||
static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO;
|
||||
static void *arg_root_hash = NULL;
|
||||
static size_t arg_root_hash_size = 0;
|
||||
static char **arg_syscall_whitelist = NULL;
|
||||
static char **arg_syscall_blacklist = NULL;
|
||||
|
||||
static void help(void) {
|
||||
printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
|
||||
@ -267,6 +269,8 @@ static void help(void) {
|
||||
" --capability=CAP In addition to the default, retain specified\n"
|
||||
" capability\n"
|
||||
" --drop-capability=CAP Drop the specified capability from the default set\n"
|
||||
" --system-call-filter=LIST|~LIST\n"
|
||||
" Permit/prohibit specific system calls\n"
|
||||
" --kill-signal=SIGNAL Select signal to use for shutting down PID 1\n"
|
||||
" --link-journal=MODE Link up guest journal, one of no, auto, guest, \n"
|
||||
" host, try-guest, try-host\n"
|
||||
@ -431,6 +435,7 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
ARG_PRIVATE_USERS_CHOWN,
|
||||
ARG_NOTIFY_READY,
|
||||
ARG_ROOT_HASH,
|
||||
ARG_SYSTEM_CALL_FILTER,
|
||||
};
|
||||
|
||||
static const struct option options[] = {
|
||||
@ -482,6 +487,7 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
{ "pivot-root", required_argument, NULL, ARG_PIVOT_ROOT },
|
||||
{ "notify-ready", required_argument, NULL, ARG_NOTIFY_READY },
|
||||
{ "root-hash", required_argument, NULL, ARG_ROOT_HASH },
|
||||
{ "system-call-filter", required_argument, NULL, ARG_SYSTEM_CALL_FILTER },
|
||||
{}
|
||||
};
|
||||
|
||||
@ -1051,6 +1057,36 @@ static int parse_argv(int argc, char *argv[]) {
|
||||
break;
|
||||
}
|
||||
|
||||
case ARG_SYSTEM_CALL_FILTER: {
|
||||
bool negative;
|
||||
const char *items;
|
||||
|
||||
negative = optarg[0] == '~';
|
||||
items = negative ? optarg + 1 : optarg;
|
||||
|
||||
for (;;) {
|
||||
_cleanup_free_ char *word = NULL;
|
||||
|
||||
r = extract_first_word(&items, &word, NULL, 0);
|
||||
if (r == 0)
|
||||
break;
|
||||
if (r == -ENOMEM)
|
||||
return log_oom();
|
||||
if (r < 0)
|
||||
return log_error_errno(r, "Failed to parse system call filter: %m");
|
||||
|
||||
if (negative)
|
||||
r = strv_extend(&arg_syscall_blacklist, word);
|
||||
else
|
||||
r = strv_extend(&arg_syscall_whitelist, word);
|
||||
if (r < 0)
|
||||
return log_oom();
|
||||
}
|
||||
|
||||
arg_settings_mask |= SETTING_SYSCALL_FILTER;
|
||||
break;
|
||||
}
|
||||
|
||||
case '?':
|
||||
return -EINVAL;
|
||||
|
||||
@ -2606,7 +2642,7 @@ static int outer_child(
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = setup_seccomp(arg_caps_retain);
|
||||
r = setup_seccomp(arg_caps_retain, arg_syscall_whitelist, arg_syscall_blacklist);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
@ -3111,6 +3147,21 @@ static int load_settings(void) {
|
||||
if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0)
|
||||
arg_notify_ready = settings->notify_ready;
|
||||
|
||||
if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {
|
||||
|
||||
if (!arg_settings_trusted && !strv_isempty(arg_syscall_whitelist))
|
||||
log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", p);
|
||||
else {
|
||||
strv_free(arg_syscall_whitelist);
|
||||
strv_free(arg_syscall_blacklist);
|
||||
|
||||
arg_syscall_whitelist = settings->syscall_whitelist;
|
||||
arg_syscall_blacklist = settings->syscall_blacklist;
|
||||
|
||||
settings->syscall_whitelist = settings->syscall_blacklist = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -682,14 +682,17 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action);
|
||||
static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
|
||||
|
||||
int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action) {
|
||||
int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
|
||||
int r;
|
||||
|
||||
assert(seccomp);
|
||||
assert(name);
|
||||
|
||||
if (strv_contains(exclude, name))
|
||||
return 0;
|
||||
|
||||
if (name[0] == '@') {
|
||||
const SyscallFilterSet *other;
|
||||
|
||||
@ -697,7 +700,7 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
|
||||
if (!other)
|
||||
return -EINVAL;
|
||||
|
||||
r = seccomp_add_syscall_filter_set(seccomp, other, action);
|
||||
r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
|
||||
if (r < 0)
|
||||
return r;
|
||||
} else {
|
||||
@ -719,7 +722,8 @@ int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name,
|
||||
static int seccomp_add_syscall_filter_set(
|
||||
scmp_filter_ctx seccomp,
|
||||
const SyscallFilterSet *set,
|
||||
uint32_t action) {
|
||||
uint32_t action,
|
||||
char **exclude) {
|
||||
|
||||
const char *sys;
|
||||
int r;
|
||||
@ -728,7 +732,7 @@ static int seccomp_add_syscall_filter_set(
|
||||
assert(set);
|
||||
|
||||
NULSTR_FOREACH(sys, set->value) {
|
||||
r = seccomp_add_syscall_filter_item(seccomp, sys, action);
|
||||
r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
@ -754,7 +758,7 @@ int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilter
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = seccomp_add_syscall_filter_set(seccomp, set, action);
|
||||
r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
|
||||
if (r < 0) {
|
||||
log_debug_errno(r, "Failed to add filter set, ignoring: %m");
|
||||
continue;
|
||||
|
@ -69,7 +69,7 @@ const SyscallFilterSet *syscall_filter_set_find(const char *name);
|
||||
|
||||
int seccomp_filter_set_add(Set *s, bool b, const SyscallFilterSet *set);
|
||||
|
||||
int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action);
|
||||
int seccomp_add_syscall_filter_item(scmp_filter_ctx *ctx, const char *name, uint32_t action, char **exclude);
|
||||
|
||||
int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action);
|
||||
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action);
|
||||
|
Loading…
x
Reference in New Issue
Block a user