1
1
mirror of https://github.com/systemd/systemd-stable.git synced 2025-01-06 13:17:44 +03:00

Merge pull request #4483 from poettering/exec-order

More seccomp fixes, and a change to the order in which the SELinux/AppArmor/SMACK contexts and the seccomp filters are applied on exec
This commit is contained in:
Lennart Poettering 2016-11-02 16:09:59 -06:00 committed by GitHub
commit 32e134c19f
6 changed files with 127 additions and 63 deletions

5
TODO
View File

@ -32,6 +32,11 @@ Janitorial Clean-ups:
Features: Features:
* drop nss-myhostname in favour of nss-resolve?
* drop internal dlopen() based nss-dns fallback in nss-resolve, and rely on the
external nsswitch.conf based one
* add a percentage syntax for TimeoutStopSec=, e.g. TimeoutStopSec=150%, and * add a percentage syntax for TimeoutStopSec=, e.g. TimeoutStopSec=150%, and
then use that for the setting used in user@.service. It should be understood then use that for the setting used in user@.service. It should be understood
relative to the configured default value. relative to the configured default value.

View File

@ -1255,30 +1255,28 @@
<varlistentry> <varlistentry>
<term><varname>SystemCallFilter=</varname></term> <term><varname>SystemCallFilter=</varname></term>
<listitem><para>Takes a space-separated list of system call <listitem><para>Takes a space-separated list of system call names. If this setting is used, all system calls
names. If this setting is used, all system calls executed by executed by the unit processes except for the listed ones will result in immediate process termination with the
the unit processes except for the listed ones will result in <constant>SIGSYS</constant> signal (whitelisting). If the first character of the list is <literal>~</literal>,
immediate process termination with the the effect is inverted: only the listed system calls will result in immediate process termination
<constant>SIGSYS</constant> signal (whitelisting). If the (blacklisting). If running in user mode, or in system mode, but without the <constant>CAP_SYS_ADMIN</constant>
first character of the list is <literal>~</literal>, the capability (e.g. setting <varname>User=nobody</varname>), <varname>NoNewPrivileges=yes</varname> is
effect is inverted: only the listed system calls will result implied. This feature makes use of the Secure Computing Mode 2 interfaces of the kernel ('seccomp filtering')
in immediate process termination (blacklisting). If running in and is useful for enforcing a minimal sandboxing environment. Note that the <function>execve</function>,
user mode, or in system mode, but without the <function>exit</function>, <function>exit_group</function>, <function>getrlimit</function>,
<constant>CAP_SYS_ADMIN</constant> capability (e.g. setting <function>rt_sigreturn</function>, <function>sigreturn</function> system calls and the system calls for
<varname>User=nobody</varname>), querying time and sleeping are implicitly whitelisted and do not need to be listed explicitly. This option may
<varname>NoNewPrivileges=yes</varname> is implied. This be specified more than once, in which case the filter masks are merged. If the empty string is assigned, the
feature makes use of the Secure Computing Mode 2 interfaces of filter is reset, all prior assignments will have no effect. This does not affect commands prefixed with
the kernel ('seccomp filtering') and is useful for enforcing a <literal>+</literal>.</para>
minimal sandboxing environment. Note that the
<function>execve</function>, <para>Note that strict system call filters may impact execution and error handling code paths of the service
<function>rt_sigreturn</function>, invocation. Specifically, access to the <function>execve</function> system call is required for the execution
<function>sigreturn</function>, of the service binary — if it is blocked, service invocation will necessarily fail. Also, if execution of the
<function>exit_group</function>, <function>exit</function> service binary fails for some reason (for example: missing service executable), the error handling logic might
system calls are implicitly whitelisted and do not need to be require access to an additional set of system calls in order to process and log this failure correctly. It
listed explicitly. This option may be specified more than once, might be necessary to temporarily disable system call filters in order to simplify debugging of such
in which case the filter masks are merged. If the empty string failures.</para>
is assigned, the filter is reset, all prior assignments will
have no effect. This does not affect commands prefixed with <literal>+</literal>.</para>
<para>If you specify both types of this option (i.e. <para>If you specify both types of this option (i.e.
whitelisting and blacklisting), the first encountered will whitelisting and blacklisting), the first encountered will
@ -1311,6 +1309,10 @@
</row> </row>
</thead> </thead>
<tbody> <tbody>
<row>
<entry>@basic-io</entry>
<entry>System calls for basic I/O: reading, writing, seeking, file descriptor duplication and closing (<citerefentry project='man-pages'><refentrytitle>read</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>write</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry>
</row>
<row> <row>
<entry>@clock</entry> <entry>@clock</entry>
<entry>System calls for changing the system clock (<citerefentry project='man-pages'><refentrytitle>adjtimex</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>settimeofday</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry> <entry>System calls for changing the system clock (<citerefentry project='man-pages'><refentrytitle>adjtimex</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>settimeofday</refentrytitle><manvolnum>2</manvolnum></citerefentry>, and related calls)</entry>
@ -1329,7 +1331,7 @@
</row> </row>
<row> <row>
<entry>@ipc</entry> <entry>@ipc</entry>
<entry>SysV IPC, POSIX Message Queues or other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry> <entry>Pipes, SysV IPC, POSIX Message Queues and other IPC (<citerefentry project='man-pages'><refentrytitle>mq_overview</refentrytitle><manvolnum>7</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>svipc</refentrytitle><manvolnum>7</manvolnum></citerefentry>)</entry>
</row> </row>
<row> <row>
<entry>@keyring</entry> <entry>@keyring</entry>
@ -1357,17 +1359,21 @@
</row> </row>
<row> <row>
<entry>@process</entry> <entry>@process</entry>
<entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>execve</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …</entry> <entry>Process control, execution, namespaces (<citerefentry project='man-pages'><refentrytitle>clone</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>kill</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>namespaces</refentrytitle><manvolnum>7</manvolnum></citerefentry>, …)</entry>
</row> </row>
<row> <row>
<entry>@raw-io</entry> <entry>@raw-io</entry>
<entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …</entry> <entry>Raw I/O port access (<citerefentry project='man-pages'><refentrytitle>ioperm</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>iopl</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <function>pciconfig_read()</function>, …)</entry>
</row>
<row>
<entry>@resources</entry>
<entry>System calls for changing resource limits, memory and scheduling parameters (<citerefentry project='man-pages'><refentrytitle>setrlimit</refentrytitle><manvolnum>2</manvolnum></citerefentry>, <citerefentry project='man-pages'><refentrytitle>setpriority</refentrytitle><manvolnum>2</manvolnum></citerefentry>, …)</entry>
</row> </row>
</tbody> </tbody>
</tgroup> </tgroup>
</table> </table>
Note, that as new system calls are added to the kernel, additional system calls might be added to the groups Note that as new system calls are added to the kernel, additional system calls might be added to the groups
above, so the contents of the sets may change between systemd versions.</para> above, so the contents of the sets may change between systemd versions.</para>
<para>It is recommended to combine the file system namespacing related options with <para>It is recommended to combine the file system namespacing related options with

View File

@ -2540,12 +2540,6 @@ static int exec_child(
(void) umask(context->umask); (void) umask(context->umask);
if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) { if ((params->flags & EXEC_APPLY_PERMISSIONS) && !command->privileged) {
r = setup_smack(context, command);
if (r < 0) {
*exit_status = EXIT_SMACK_PROCESS_LABEL;
return r;
}
if (context->pam_name && username) { if (context->pam_name && username) {
r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds); r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
if (r < 0) { if (r < 0) {
@ -2695,6 +2689,41 @@ static int exec_child(
} }
} }
/* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
* influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
* syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
* are restricted. */
#ifdef HAVE_SELINUX
if (mac_selinux_use()) {
char *exec_context = mac_selinux_context_net ?: context->selinux_context;
if (exec_context) {
r = setexeccon(exec_context);
if (r < 0) {
*exit_status = EXIT_SELINUX_CONTEXT;
return r;
}
}
}
#endif
r = setup_smack(context, command);
if (r < 0) {
*exit_status = EXIT_SMACK_PROCESS_LABEL;
return r;
}
#ifdef HAVE_APPARMOR
if (context->apparmor_profile && mac_apparmor_use()) {
r = aa_change_onexec(context->apparmor_profile);
if (r < 0 && !context->apparmor_profile_ignore) {
*exit_status = EXIT_APPARMOR_PROFILE;
return -errno;
}
}
#endif
/* PR_GET_SECUREBITS is not privileged, while /* PR_GET_SECUREBITS is not privileged, while
* PR_SET_SECUREBITS is. So to suppress * PR_SET_SECUREBITS is. So to suppress
* potential EPERMs we'll try not to call * potential EPERMs we'll try not to call
@ -2760,6 +2789,8 @@ static int exec_child(
} }
} }
/* This really should remain the last step before the execve(), to make sure our own code is affected
* by the filter as little as possible. */
if (context_has_syscall_filters(context)) { if (context_has_syscall_filters(context)) {
r = apply_seccomp(unit, context); r = apply_seccomp(unit, context);
if (r < 0) { if (r < 0) {
@ -2768,30 +2799,6 @@ static int exec_child(
} }
} }
#endif #endif
#ifdef HAVE_SELINUX
if (mac_selinux_use()) {
char *exec_context = mac_selinux_context_net ?: context->selinux_context;
if (exec_context) {
r = setexeccon(exec_context);
if (r < 0) {
*exit_status = EXIT_SELINUX_CONTEXT;
return r;
}
}
}
#endif
#ifdef HAVE_APPARMOR
if (context->apparmor_profile && mac_apparmor_use()) {
r = aa_change_onexec(context->apparmor_profile);
if (r < 0 && !context->apparmor_profile_ignore) {
*exit_status = EXIT_APPARMOR_PROFILE;
return -errno;
}
}
#endif
} }
final_argv = replace_env_argv(argv, accum_env); final_argv = replace_env_argv(argv, accum_env);
@ -3613,7 +3620,8 @@ char *exec_command_line(char **argv) {
STRV_FOREACH(a, argv) STRV_FOREACH(a, argv)
k += strlen(*a)+3; k += strlen(*a)+3;
if (!(n = new(char, k))) n = new(char, k);
if (!n)
return NULL; return NULL;
p = n; p = n;

View File

@ -217,6 +217,24 @@ bool is_seccomp_available(void) {
} }
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = { const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
[SYSCALL_FILTER_SET_BASIC_IO] = {
/* Basic IO */
.name = "@basic-io",
.value =
"close\0"
"dup2\0"
"dup3\0"
"dup\0"
"lseek\0"
"pread64\0"
"preadv\0"
"pwrite64\0"
"pwritev\0"
"read\0"
"readv\0"
"write\0"
"writev\0"
},
[SYSCALL_FILTER_SET_CLOCK] = { [SYSCALL_FILTER_SET_CLOCK] = {
/* Clock */ /* Clock */
.name = "@clock", .name = "@clock",
@ -253,15 +271,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"sys_debug_setcontext\0" "sys_debug_setcontext\0"
}, },
[SYSCALL_FILTER_SET_DEFAULT] = { [SYSCALL_FILTER_SET_DEFAULT] = {
/* Default list */ /* Default list: the most basic of operations */
.name = "@default", .name = "@default",
.value = .value =
"clock_getres\0"
"clock_gettime\0"
"clock_nanosleep\0"
"execve\0" "execve\0"
"exit\0" "exit\0"
"exit_group\0" "exit_group\0"
"getrlimit\0" /* make sure processes can query stack size and such */ "getrlimit\0" /* make sure processes can query stack size and such */
"gettimeofday\0"
"nanosleep\0"
"pause\0"
"rt_sigreturn\0" "rt_sigreturn\0"
"sigreturn\0" "sigreturn\0"
"time\0"
}, },
[SYSCALL_FILTER_SET_IO_EVENT] = { [SYSCALL_FILTER_SET_IO_EVENT] = {
/* Event loop use */ /* Event loop use */
@ -283,9 +308,10 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"select\0" "select\0"
}, },
[SYSCALL_FILTER_SET_IPC] = { [SYSCALL_FILTER_SET_IPC] = {
/* Message queues, SYSV IPC or other IPC: unusual */ /* Message queues, SYSV IPC or other IPC */
.name = "@ipc", .name = "@ipc",
.value = "ipc\0" .value = "ipc\0"
"memfd_create\0"
"mq_getsetattr\0" "mq_getsetattr\0"
"mq_notify\0" "mq_notify\0"
"mq_open\0" "mq_open\0"
@ -296,6 +322,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"msgget\0" "msgget\0"
"msgrcv\0" "msgrcv\0"
"msgsnd\0" "msgsnd\0"
"pipe2\0"
"pipe\0"
"process_vm_readv\0" "process_vm_readv\0"
"process_vm_writev\0" "process_vm_writev\0"
"semctl\0" "semctl\0"
@ -436,7 +464,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
.value = .value =
"arch_prctl\0" "arch_prctl\0"
"clone\0" "clone\0"
"execve\0"
"execveat\0" "execveat\0"
"fork\0" "fork\0"
"kill\0" "kill\0"
@ -463,6 +490,22 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
"s390_pci_mmio_write\0" "s390_pci_mmio_write\0"
#endif #endif
}, },
[SYSCALL_FILTER_SET_RESOURCES] = {
/* Alter resource settings */
.name = "@resources",
.value =
"sched_setparam\0"
"sched_setscheduler\0"
"sched_setaffinity\0"
"setpriority\0"
"setrlimit\0"
"set_mempolicy\0"
"migrate_pages\0"
"move_pages\0"
"mbind\0"
"sched_setattr\0"
"prlimit64\0"
},
}; };
const SyscallFilterSet *syscall_filter_set_find(const char *name) { const SyscallFilterSet *syscall_filter_set_find(const char *name) {

View File

@ -38,6 +38,7 @@ typedef struct SyscallFilterSet {
} SyscallFilterSet; } SyscallFilterSet;
enum { enum {
SYSCALL_FILTER_SET_BASIC_IO,
SYSCALL_FILTER_SET_CLOCK, SYSCALL_FILTER_SET_CLOCK,
SYSCALL_FILTER_SET_CPU_EMULATION, SYSCALL_FILTER_SET_CPU_EMULATION,
SYSCALL_FILTER_SET_DEBUG, SYSCALL_FILTER_SET_DEBUG,
@ -52,6 +53,7 @@ enum {
SYSCALL_FILTER_SET_PRIVILEGED, SYSCALL_FILTER_SET_PRIVILEGED,
SYSCALL_FILTER_SET_PROCESS, SYSCALL_FILTER_SET_PROCESS,
SYSCALL_FILTER_SET_RAW_IO, SYSCALL_FILTER_SET_RAW_IO,
SYSCALL_FILTER_SET_RESOURCES,
_SYSCALL_FILTER_SET_MAX _SYSCALL_FILTER_SET_MAX
}; };

View File

@ -589,7 +589,7 @@ static void test_install_printf(void) {
assert_se(specifier_machine_id('m', NULL, NULL, &mid) >= 0 && mid); assert_se(specifier_machine_id('m', NULL, NULL, &mid) >= 0 && mid);
assert_se(specifier_boot_id('b', NULL, NULL, &bid) >= 0 && bid); assert_se(specifier_boot_id('b', NULL, NULL, &bid) >= 0 && bid);
assert_se((host = gethostname_malloc())); assert_se((host = gethostname_malloc()));
assert_se((user = getusername_malloc())); assert_se((user = uid_to_name(getuid())));
assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0); assert_se(asprintf(&uid, UID_FMT, getuid()) >= 0);
#define expect(src, pattern, result) \ #define expect(src, pattern, result) \