linux-stable/include/linux/seccomp.h
Sargun Dhillon c2aa2dfef2 seccomp: Add wait_killable semantic to seccomp user notifier
This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
that makes it so that when notifications are received by the supervisor the
notifying process will transition to wait killable semantics. Although wait
killable isn't a set of semantics formally exposed to userspace, the
concept is searchable. If the notifying process is signaled prior to the
notification being received by the userspace agent, it will be handled as
normal.

One quirk about how this is handled is that the notifying process
only switches to TASK_KILLABLE if it receives a wakeup from either
an addfd or a signal. This is to avoid an unnecessary wakeup of
the notifying task.

The reasons behind switching into wait_killable only after userspace
receives the notification are:
* Avoiding unncessary work - Often, workloads will perform work that they
  may abort (request racing comes to mind). This allows for syscalls to be
  aborted safely prior to the notification being received by the
  supervisor. In this, the supervisor doesn't end up doing work that the
  workload does not want to complete anyways.
* Avoiding side effects - We don't want the syscall to be interruptible
  once the supervisor starts doing work because it may not be trivial
  to reverse the operation. For example, unmounting a file system may
  take a long time, and it's hard to rollback, or treat that as
  reentrant.
* Avoid breaking runtimes - Various runtimes do not GC when they are
  during a syscall (or while running native code that subsequently
  calls a syscall). If many notifications are blocked, and not picked
  up by the supervisor, this can get the application into a bad state.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
2022-05-03 14:11:58 -07:00

132 lines
3.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SECCOMP_H
#define _LINUX_SECCOMP_H
#include <uapi/linux/seccomp.h>
#define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \
SECCOMP_FILTER_FLAG_LOG | \
SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
SECCOMP_FILTER_FLAG_NEW_LISTENER | \
SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
/* sizeof() the first published struct seccomp_notif_addfd */
#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0
#ifdef CONFIG_SECCOMP
#include <linux/thread_info.h>
#include <linux/atomic.h>
#include <asm/seccomp.h>
struct seccomp_filter;
/**
* struct seccomp - the state of a seccomp'ed process
*
* @mode: indicates one of the valid values above for controlled
* system calls available to a process.
* @filter: must always point to a valid seccomp-filter or NULL as it is
* accessed without locking during system call entry.
*
* @filter must only be accessed from the context of current as there
* is no read locking.
*/
struct seccomp {
int mode;
atomic_t filter_count;
struct seccomp_filter *filter;
};
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
extern int __secure_computing(const struct seccomp_data *sd);
static inline int secure_computing(void)
{
if (unlikely(test_syscall_work(SECCOMP)))
return __secure_computing(NULL);
return 0;
}
#else
extern void secure_computing_strict(int this_syscall);
#endif
extern long prctl_get_seccomp(void);
extern long prctl_set_seccomp(unsigned long, void __user *);
static inline int seccomp_mode(struct seccomp *s)
{
return s->mode;
}
#else /* CONFIG_SECCOMP */
#include <linux/errno.h>
struct seccomp { };
struct seccomp_filter { };
struct seccomp_data;
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
static inline int secure_computing(void) { return 0; }
static inline int __secure_computing(const struct seccomp_data *sd) { return 0; }
#else
static inline void secure_computing_strict(int this_syscall) { return; }
#endif
static inline long prctl_get_seccomp(void)
{
return -EINVAL;
}
static inline long prctl_set_seccomp(unsigned long arg2, char __user *arg3)
{
return -EINVAL;
}
static inline int seccomp_mode(struct seccomp *s)
{
return SECCOMP_MODE_DISABLED;
}
#endif /* CONFIG_SECCOMP */
#ifdef CONFIG_SECCOMP_FILTER
extern void seccomp_filter_release(struct task_struct *tsk);
extern void get_seccomp_filter(struct task_struct *tsk);
#else /* CONFIG_SECCOMP_FILTER */
static inline void seccomp_filter_release(struct task_struct *tsk)
{
return;
}
static inline void get_seccomp_filter(struct task_struct *tsk)
{
return;
}
#endif /* CONFIG_SECCOMP_FILTER */
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
extern long seccomp_get_filter(struct task_struct *task,
unsigned long filter_off, void __user *data);
extern long seccomp_get_metadata(struct task_struct *task,
unsigned long filter_off, void __user *data);
#else
static inline long seccomp_get_filter(struct task_struct *task,
unsigned long n, void __user *data)
{
return -EINVAL;
}
static inline long seccomp_get_metadata(struct task_struct *task,
unsigned long filter_off,
void __user *data)
{
return -EINVAL;
}
#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
struct seq_file;
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
#endif
#endif /* _LINUX_SECCOMP_H */