c2aa2dfef2
This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) that makes it so that when notifications are received by the supervisor the notifying process will transition to wait killable semantics. Although wait killable isn't a set of semantics formally exposed to userspace, the concept is searchable. If the notifying process is signaled prior to the notification being received by the userspace agent, it will be handled as normal. One quirk about how this is handled is that the notifying process only switches to TASK_KILLABLE if it receives a wakeup from either an addfd or a signal. This is to avoid an unnecessary wakeup of the notifying task. The reasons behind switching into wait_killable only after userspace receives the notification are: * Avoiding unnecessary work - Often, workloads will perform work that they may abort (request racing comes to mind). This allows for syscalls to be aborted safely prior to the notification being received by the supervisor. In this, the supervisor doesn't end up doing work that the workload does not want to complete anyway. * Avoiding side effects - We don't want the syscall to be interruptible once the supervisor starts doing work because it may not be trivial to reverse the operation. For example, unmounting a file system may take a long time, and it's hard to roll back, or treat that as reentrant. * Avoid breaking runtimes - Various runtimes do not GC while they are in a syscall (or while running native code that subsequently calls a syscall). If many notifications are blocked, and not picked up by the supervisor, this can get the application into a bad state. Signed-off-by: Sargun Dhillon <sargun@sargun.me> Signed-off-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
133 lines
3.5 KiB
C
133 lines
3.5 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _LINUX_SECCOMP_H
|
|
#define _LINUX_SECCOMP_H
|
|
|
|
#include <uapi/linux/seccomp.h>
|
|
|
|
/*
 * Bitwise OR of every SECCOMP_FILTER_FLAG_* bit named below.
 * NOTE(review): presumably the set of flags accepted when a filter is
 * installed - confirm against the user of this mask.
 */
#define SECCOMP_FILTER_FLAG_MASK				\
	(SECCOMP_FILTER_FLAG_TSYNC			|	\
	 SECCOMP_FILTER_FLAG_LOG			|	\
	 SECCOMP_FILTER_FLAG_SPEC_ALLOW			|	\
	 SECCOMP_FILTER_FLAG_NEW_LISTENER		|	\
	 SECCOMP_FILTER_FLAG_TSYNC_ESRCH		|	\
	 SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
|
|
|
|
/*
 * sizeof() the first published struct seccomp_notif_addfd, in bytes.
 * _LATEST is an alias for the newest known size, which is currently VER0.
 */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0		24
#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST	SECCOMP_NOTIFY_ADDFD_SIZE_VER0
|
|
|
|
#ifdef CONFIG_SECCOMP
|
|
|
|
#include <linux/thread_info.h>
|
|
#include <linux/atomic.h>
|
|
#include <asm/seccomp.h>
|
|
|
|
struct seccomp_filter;
|
|
/**
|
|
* struct seccomp - the state of a seccomp'ed process
|
|
*
|
|
* @mode: indicates one of the valid values above for controlled
|
|
* system calls available to a process.
|
|
* @filter: must always point to a valid seccomp-filter or NULL as it is
|
|
* accessed without locking during system call entry.
|
|
*
|
|
* @filter must only be accessed from the context of current as there
|
|
* is no read locking.
|
|
*/
|
|
struct seccomp {
|
|
int mode;
|
|
atomic_t filter_count;
|
|
struct seccomp_filter *filter;
|
|
};
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
|
|
/* Defined outside this header; secure_computing() calls it with sd == NULL. */
int __secure_computing(const struct seccomp_data *sd);
|
|
static inline int secure_computing(void)
|
|
{
|
|
if (unlikely(test_syscall_work(SECCOMP)))
|
|
return __secure_computing(NULL);
|
|
return 0;
|
|
}
|
|
#else
|
|
/*
 * Declared instead of secure_computing() when the architecture does not
 * set CONFIG_HAVE_ARCH_SECCOMP_FILTER.  Defined outside this header.
 */
void secure_computing_strict(int this_syscall);
|
|
#endif
|
|
|
|
extern long prctl_get_seccomp(void);
|
|
extern long prctl_set_seccomp(unsigned long, void __user *);
|
|
|
|
static inline int seccomp_mode(struct seccomp *s)
|
|
{
|
|
return s->mode;
|
|
}
|
|
|
|
#else /* CONFIG_SECCOMP */
|
|
|
|
#include <linux/errno.h>
|
|
|
|
/*
 * Without CONFIG_SECCOMP there is no per-task seccomp state: both
 * structures are empty placeholders so that code embedding or pointing
 * at them still compiles.
 */
struct seccomp {
};

struct seccomp_filter {
};

/* Only ever referenced through a pointer in this branch. */
struct seccomp_data;
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
|
|
/* !CONFIG_SECCOMP stub: performs no check and always returns 0. */
static inline int secure_computing(void)
{
	return 0;
}
|
|
/* !CONFIG_SECCOMP stub: @sd is ignored and 0 is always returned. */
static inline int __secure_computing(const struct seccomp_data *sd)
{
	return 0;
}
|
|
#else
|
|
/* !CONFIG_SECCOMP stub: @this_syscall is ignored and nothing is done. */
static inline void secure_computing_strict(int this_syscall)
{
}
|
|
#endif
|
|
|
|
/*
 * !CONFIG_SECCOMP stub: there is no seccomp state to report, so the
 * request is always rejected.
 *
 * Returns -EINVAL.
 */
static inline long prctl_get_seccomp(void)
{
	return -EINVAL;
}
|
|
|
|
/*
 * !CONFIG_SECCOMP stub: seccomp cannot be configured, so every request
 * is rejected.
 *
 * @arg2: ignored.
 * @arg3: ignored.  Typed void __user * to match the prototype used when
 *        CONFIG_SECCOMP is enabled; callers passing a char __user *
 *        still convert implicitly.
 *
 * Returns -EINVAL.
 */
static inline long prctl_set_seccomp(unsigned long arg2, void __user *arg3)
{
	return -EINVAL;
}
|
|
|
|
static inline int seccomp_mode(struct seccomp *s)
|
|
{
|
|
return SECCOMP_MODE_DISABLED;
|
|
}
|
|
#endif /* CONFIG_SECCOMP */
|
|
|
|
#ifdef CONFIG_SECCOMP_FILTER
|
|
/*
 * Per-task filter helpers, defined outside this header when
 * CONFIG_SECCOMP_FILTER is set.
 * NOTE(review): names suggest release/acquire of a task's filter
 * reference - confirm against the definitions.
 */
void seccomp_filter_release(struct task_struct *tsk);
void get_seccomp_filter(struct task_struct *tsk);
|
|
#else /* CONFIG_SECCOMP_FILTER */
|
|
/* !CONFIG_SECCOMP_FILTER stub: @tsk is ignored and nothing is done. */
static inline void seccomp_filter_release(struct task_struct *tsk)
{
}
|
|
/* !CONFIG_SECCOMP_FILTER stub: @tsk is ignored and nothing is done. */
static inline void get_seccomp_filter(struct task_struct *tsk)
{
}
|
|
#endif /* CONFIG_SECCOMP_FILTER */
|
|
|
|
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
|
|
/*
 * Available only with both CONFIG_SECCOMP_FILTER and
 * CONFIG_CHECKPOINT_RESTORE; defined outside this header.
 * NOTE(review): presumably @filter_off selects one of @task's filters and
 * @data is the user buffer receiving the result - confirm against the
 * definitions.
 */
long seccomp_get_filter(struct task_struct *task,
			unsigned long filter_off,
			void __user *data);
long seccomp_get_metadata(struct task_struct *task,
			  unsigned long filter_off,
			  void __user *data);
|
|
#else
|
|
/*
 * Stub used unless both CONFIG_SECCOMP_FILTER and
 * CONFIG_CHECKPOINT_RESTORE are enabled.
 *
 * @task:       ignored.
 * @filter_off: ignored (named to match the real prototype and the
 *              seccomp_get_metadata() stub).
 * @data:       ignored; never written.
 *
 * Returns -EINVAL.
 */
static inline long seccomp_get_filter(struct task_struct *task,
				      unsigned long filter_off,
				      void __user *data)
{
	return -EINVAL;
}
|
|
static inline long seccomp_get_metadata(struct task_struct *task,
|
|
unsigned long filter_off,
|
|
void __user *data)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
|
|
|
|
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
|
/* Only a pointer to the seq_file is needed by the prototype below. */
struct seq_file;

/*
 * Declared only when CONFIG_SECCOMP_CACHE_DEBUG is set; defined outside
 * this header.
 * NOTE(review): the name and seq_file argument suggest a per-pid /proc
 * show handler - confirm against the definition.
 */
int proc_pid_seccomp_cache(struct seq_file *m,
			   struct pid_namespace *ns,
			   struct pid *pid,
			   struct task_struct *task);
|
|
#endif
|
|
#endif /* _LINUX_SECCOMP_H */
|