mirror of
https://github.com/systemd/systemd-stable.git
synced 2025-01-05 09:17:44 +03:00
Merge pull request #4483 from poettering/exec-order
more seccomp fixes, and change of order of selinux/aa/smack and seccomp application on exec
This commit is contained in:
commit
32e134c19f
5
TODO
5
TODO
@ -32,6 +32,11 @@ Janitorial Clean-ups:
|
||||
|
||||
Features:
|
||||
|
||||
* drop nss-myhostname in favour of nss-resolve?
|
||||
|
||||
* drop internal dlopen() based nss-dns fallback in nss-resolve, and rely on the
|
||||
external nsswitch.conf based one
|
||||
|
||||
* add a percentage syntax for TimeoutStopSec=, e.g. TimeoutStopSec=150%, and
|
||||
then use that for the setting used in user@.service. It should be understood
|
||||
relative to the configured default value.
|
||||
|
@ -1255,30 +1255,28 @@
|
||||
<varlistentry>
|
||||
<term><varname>SystemCallFilter=</varname></term>
|
||||
|
||||
<listitem><para>Takes a space-separated list of system call
|
||||
names. If this setting is used, all system calls executed by
|
||||
the unit processes except for the listed ones will result in
|
||||
immediate process termination with the
|
||||
<constant>SIGSYS</constant> signal (whitelisting). If the
|
||||
first character of the list is <literal>~</literal>, the
|
||||
effect is inverted: only the listed system calls will result
|
||||
in immediate process termination (blacklisting). If running in
|
||||
user mode, or in system mode, but without the
|
||||
<constant>CAP_SYS_ADMIN</constant> capability (e.g. setting
|
||||
<varname>User=nobody</varname>),
|
||||
<varname>NoNewPrivileges=yes</varname> is implied. This
|
||||
feature makes use of the Secure Computing Mode 2 interfaces of
|
||||
the kernel ('seccomp filtering') and is useful for enforcing a
|
||||
minimal sandboxing environment. Note that the
|
||||
<function>execve</function>,
|
||||
<function>rt_sigreturn</function>,
|
||||
<function>sigreturn</function>,
|
||||
<function>exit_group</function>, <function>exit</function>
|
||||
system calls are implicitly whitelisted and do not need to be
|
||||
listed explicitly. This option may be specified more than once,
|
||||
in which case the filter masks are merged. If the empty string
|
||||
is assigned, the filter is reset, all prior assignments will
|
||||
have no effect. This does not affect commands prefixed with <literal>+</literal>.</para>
|
||||
<listitem><para>Takes a space-separated list of system call names. If this setting is used, all system calls
|
||||
executed by the unit processes except for the listed ones will result in immediate process termination with the
|
||||
<constant>SIGSYS</constant> signal (whitelisting). If the first character of the list is <literal>~</literal>,
|
||||
the effect is inverted: only the listed system calls will result in immediate process termination
|
||||
(blacklisting). If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
|
||||
capability (e.g. setting <varname>User=nobody</varname>), <varname>NoNewPrivileges=yes</varname> is
|
||||
implied. This feature makes use of the Secure Computing Mode 2 interfaces of the kernel ('seccomp filtering')
|
||||
and is useful for enforcing a minimal sandboxing environment. Note that the <function>execve</function>,
|
||||
<function>exit</function>, <function>exit_group</function>, <function>getrlimit</function>,
|
||||
<function>rt_sigreturn</function>, <function>sigreturn</function> system calls and the system calls for
|
||||
querying time and sleeping are implicitly whitelisted and do not need to be listed explicitly. This option may
|
||||
be specified more than once, in which case the filter masks are merged. If the empty string is assigned, the
|
||||
filter is reset, all prior assignments will have no effect. This does not affect commands prefixed with
|
||||
<literal>+</literal>.</para>
|
||||
|
||||
<para>Note that strict system call filters may impact execution and error handling code paths of the service
|
||||
invocation. Specifically, access to the <function>execve</function> system call is required for the execution
|
||||
of the service binary — if it is blocked, service invocation will necessarily fail. Also, if execution of the
|
||||
service binary fails for some reason (for example: missing service executable), the error handling logic might
|
||||
require access to an additional set of system calls in order to process and log this failure correctly. It
|
||||
might be necessary to temporarily disable system call filters in order to simplify debugging of such
|
||||
failures.</para>
|
||||
|
||||
<para>If you specify both types of this option (i.e.
|
||||
whitelisting and blacklisting), the first encountered will
|
||||
@ -1311,6 +1309,10 @@
|
||||
</row>
|
||||
</thead>
|
||||
<tbody>
|
||||
<row>
|
||||
<entry>@basic-io</entry>
|
||||
<entry>System calls for basic I/O: reading, writing, seeking, file descriptor duplication and closing (<citerefentry project='man-pages'><refentrytitle>read</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>write</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>@clock</entry>
|
||||
<entry>System calls for changing the system clock (<citerefentry project='man-pages'><refentrytitle>adjtimex</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>settimeofday</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry>
|
||||
@ -1329,7 +1331,7 @@
|
||||
</row>
|
||||
<row>
|
||||
<entry>@ipc</entry>
|
||||
<entry>SysV IPC, POSIX Message Queues or other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry>
|
||||
<entry>Pipes, SysV IPC, POSIX Message Queues and other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>@keyring</entry>
|
||||
@ -1357,17 +1359,21 @@
|
||||
</row>
|
||||
<row>
|
||||
<entry>@process</entry>
|
||||
<entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>execve</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …</entry>
|
||||
<entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>clone</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …)</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>@raw-io</entry>
|
||||
<entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …</entry>
|
||||
<entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …)</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>@resources</entry>
|
||||
<entry>System calls for changing resource limits, memory and scheduling parameters (<citerefentry project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>setpriority</refentrytitle><manvolnum>2</manvolnum></citerefentry>, …)</entry>
|
||||
</row>
|
||||
</tbody>
|
||||
</tgroup>
|
||||
</table>
|
||||
|
||||
Note, that as new system calls are added to the kernel, additional system calls might be added to the groups
|
||||
Note that as new system calls are added to the kernel, additional system calls might be added to the groups
|
||||
above, so the contents of the sets may change between systemd versions.</para>
|
||||
|
||||
<para>It is recommended to combine the file system namespacing related options with
|
||||
|
@ -2540,12 +2540,6 @@ static int exec_child(
|
||||
(void) umask(context->umask);
|
||||
|
||||
if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
|
||||
r = setup_smack(context, command);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SMACK_PROCESS_LABEL;
|
||||
return r;
|
||||
}
|
||||
|
||||
if (context->pam_name && username) {
|
||||
r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
|
||||
if (r < 0) {
|
||||
@ -2695,6 +2689,41 @@ static int exec_child(
|
||||
}
|
||||
}
|
||||
|
||||
/* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
|
||||
* influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
|
||||
* syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
|
||||
* are restricted. */
|
||||
|
||||
#ifdef HAVE_SELINUX
|
||||
if (mac_selinux_use()) {
|
||||
char *exec_context = mac_selinux_context_net ?: context->selinux_context;
|
||||
|
||||
if (exec_context) {
|
||||
r = setexeccon(exec_context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SELINUX_CONTEXT;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
r = setup_smack(context, command);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SMACK_PROCESS_LABEL;
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef HAVE_APPARMOR
|
||||
if (context->apparmor_profile && mac_apparmor_use()) {
|
||||
r = aa_change_onexec(context->apparmor_profile);
|
||||
if (r < 0 && !context->apparmor_profile_ignore) {
|
||||
*exit_status = EXIT_APPARMOR_PROFILE;
|
||||
return -errno;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* PR_GET_SECUREBITS is not privileged, while
|
||||
* PR_SET_SECUREBITS is. So to suppress
|
||||
* potential EPERMs we'll try not to call
|
||||
@ -2760,6 +2789,8 @@ static int exec_child(
|
||||
}
|
||||
}
|
||||
|
||||
/* This really should remain the last step before the execve(), to make sure our own code is affected
|
||||
* by the filter as little as possible. */
|
||||
if (context_has_syscall_filters(context)) {
|
||||
r = apply_seccomp(unit, context);
|
||||
if (r < 0) {
|
||||
@ -2768,30 +2799,6 @@ static int exec_child(
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SELINUX
|
||||
if (mac_selinux_use()) {
|
||||
char *exec_context = mac_selinux_context_net ?: context->selinux_context;
|
||||
|
||||
if (exec_context) {
|
||||
r = setexeccon(exec_context);
|
||||
if (r < 0) {
|
||||
*exit_status = EXIT_SELINUX_CONTEXT;
|
||||
return r;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_APPARMOR
|
||||
if (context->apparmor_profile && mac_apparmor_use()) {
|
||||
r = aa_change_onexec(context->apparmor_profile);
|
||||
if (r < 0 && !context->apparmor_profile_ignore) {
|
||||
*exit_status = EXIT_APPARMOR_PROFILE;
|
||||
return -errno;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
final_argv = replace_env_argv(argv, accum_env);
|
||||
@ -3613,7 +3620,8 @@ char *exec_command_line(char **argv) {
|
||||
STRV_FOREACH(a, argv)
|
||||
k += strlen(*a)+3;
|
||||
|
||||
if (!(n = new(char, k)))
|
||||
n = new(char, k);
|
||||
if (!n)
|
||||
return NULL;
|
||||
|
||||
p = n;
|
||||
|
@ -217,6 +217,24 @@ bool is_seccomp_available(void) {
|
||||
}
|
||||
|
||||
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
[SYSCALL_FILTER_SET_BASIC_IO] = {
|
||||
/* Basic IO */
|
||||
.name = "@basic-io",
|
||||
.value =
|
||||
"close\0"
|
||||
"dup2\0"
|
||||
"dup3\0"
|
||||
"dup\0"
|
||||
"lseek\0"
|
||||
"pread64\0"
|
||||
"preadv\0"
|
||||
"pwrite64\0"
|
||||
"pwritev\0"
|
||||
"read\0"
|
||||
"readv\0"
|
||||
"write\0"
|
||||
"writev\0"
|
||||
},
|
||||
[SYSCALL_FILTER_SET_CLOCK] = {
|
||||
/* Clock */
|
||||
.name = "@clock",
|
||||
@ -253,15 +271,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
"sys_debug_setcontext\0"
|
||||
},
|
||||
[SYSCALL_FILTER_SET_DEFAULT] = {
|
||||
/* Default list */
|
||||
/* Default list: the most basic of operations */
|
||||
.name = "@default",
|
||||
.value =
|
||||
"clock_getres\0"
|
||||
"clock_gettime\0"
|
||||
"clock_nanosleep\0"
|
||||
"execve\0"
|
||||
"exit\0"
|
||||
"exit_group\0"
|
||||
"getrlimit\0" /* make sure processes can query stack size and such */
|
||||
"gettimeofday\0"
|
||||
"nanosleep\0"
|
||||
"pause\0"
|
||||
"rt_sigreturn\0"
|
||||
"sigreturn\0"
|
||||
"time\0"
|
||||
},
|
||||
[SYSCALL_FILTER_SET_IO_EVENT] = {
|
||||
/* Event loop use */
|
||||
@ -283,9 +308,10 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
"select\0"
|
||||
},
|
||||
[SYSCALL_FILTER_SET_IPC] = {
|
||||
/* Message queues, SYSV IPC or other IPC: unusual */
|
||||
/* Message queues, SYSV IPC or other IPC */
|
||||
.name = "@ipc",
|
||||
.value = "ipc\0"
|
||||
"memfd_create\0"
|
||||
"mq_getsetattr\0"
|
||||
"mq_notify\0"
|
||||
"mq_open\0"
|
||||
@ -296,6 +322,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
"msgget\0"
|
||||
"msgrcv\0"
|
||||
"msgsnd\0"
|
||||
"pipe2\0"
|
||||
"pipe\0"
|
||||
"process_vm_readv\0"
|
||||
"process_vm_writev\0"
|
||||
"semctl\0"
|
||||
@ -436,7 +464,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
.value =
|
||||
"arch_prctl\0"
|
||||
"clone\0"
|
||||
"execve\0"
|
||||
"execveat\0"
|
||||
"fork\0"
|
||||
"kill\0"
|
||||
@ -463,6 +490,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
|
||||
"s390_pci_mmio_write\0"
|
||||
#endif
|
||||
},
|
||||
[SYSCALL_FILTER_SET_RESOURCES] = {
|
||||
/* Alter resource settings */
|
||||
.name = "@resources",
|
||||
.value =
|
||||
"sched_setparam\0"
|
||||
"sched_setscheduler\0"
|
||||
"sched_setaffinity\0"
|
||||
"setpriority\0"
|
||||
"setrlimit\0"
|
||||
"set_mempolicy\0"
|
||||
"migrate_pages\0"
|
||||
"move_pages\0"
|
||||
"mbind\0"
|
||||
"sched_setattr\0"
|
||||
"prlimit64\0"
|
||||
},
|
||||
};
|
||||
|
||||
const SyscallFilterSet *syscall_filter_set_find(const char *name) {
|
||||
|
@ -38,6 +38,7 @@ typedef struct SyscallFilterSet {
|
||||
} SyscallFilterSet;
|
||||
|
||||
enum {
|
||||
SYSCALL_FILTER_SET_BASIC_IO,
|
||||
SYSCALL_FILTER_SET_CLOCK,
|
||||
SYSCALL_FILTER_SET_CPU_EMULATION,
|
||||
SYSCALL_FILTER_SET_DEBUG,
|
||||
@ -52,6 +53,7 @@ enum {
|
||||
SYSCALL_FILTER_SET_PRIVILEGED,
|
||||
SYSCALL_FILTER_SET_PROCESS,
|
||||
SYSCALL_FILTER_SET_RAW_IO,
|
||||
SYSCALL_FILTER_SET_RESOURCES,
|
||||
_SYSCALL_FILTER_SET_MAX
|
||||
};
|
||||
|
||||
|
@ -589,7 +589,7 @@ static void test_install_printf(void) {
|
||||
assert_se(specifier_machine_id('m', NULL, NULL, &mid) >= 0 && mid);
|
||||
assert_se(specifier_boot_id('b', NULL, NULL, &bid) >= 0 && bid);
|
||||
assert_se((host = gethostname_malloc()));
|
||||
assert_se((user = getusername_malloc()));
|
||||
assert_se((user = uid_to_name(getuid())));
|
||||
assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0);
|
||||
|
||||
#define expect(src, pattern, result) \
|
||||
|
Loading…
Reference in New Issue
Block a user