linux-stable/include/uapi/linux/seccomp.h
Sargun Dhillon c2aa2dfef2 seccomp: Add wait_killable semantic to seccomp user notifier
This introduces a per-filter flag (SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
that makes it so that when notifications are received by the supervisor the
notifying process will transition to wait killable semantics. Although wait
killable isn't a set of semantics formally exposed to userspace, the
concept is searchable. If the notifying process is signaled prior to the
notification being received by the userspace agent, it will be handled as
normal.

One quirk about how this is handled is that the notifying process
only switches to TASK_KILLABLE if it receives a wakeup from either
an addfd or a signal. This is to avoid an unnecessary wakeup of
the notifying task.

The reasons behind switching into wait_killable only after userspace
receives the notification are:
* Avoiding unncessary work - Often, workloads will perform work that they
  may abort (request racing comes to mind). This allows for syscalls to be
  aborted safely prior to the notification being received by the
  supervisor. In this, the supervisor doesn't end up doing work that the
  workload does not want to complete anyways.
* Avoiding side effects - We don't want the syscall to be interruptible
  once the supervisor starts doing work because it may not be trivial
  to reverse the operation. For example, unmounting a file system may
  take a long time, and it's hard to rollback, or treat that as
  reentrant.
* Avoid breaking runtimes - Various runtimes do not GC when they are
  during a syscall (or while running native code that subsequently
  calls a syscall). If many notifications are blocked, and not picked
  up by the supervisor, this can get the application into a bad state.

Signed-off-by: Sargun Dhillon <sargun@sargun.me>
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20220503080958.20220-2-sargun@sargun.me
2022-05-03 14:11:58 -07:00

153 lines
5.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_SECCOMP_H
#define _UAPI_LINUX_SECCOMP_H
#include <linux/compiler.h>
#include <linux/types.h>
/* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, <mode>) */
#define SECCOMP_MODE_DISABLED 0 /* seccomp is not in use. */
#define SECCOMP_MODE_STRICT 1 /* uses hard-coded filter. */
#define SECCOMP_MODE_FILTER 2 /* uses user-supplied filter. */
/* Valid operations for seccomp syscall. */
#define SECCOMP_SET_MODE_STRICT 0
#define SECCOMP_SET_MODE_FILTER 1
#define SECCOMP_GET_ACTION_AVAIL 2
#define SECCOMP_GET_NOTIF_SIZES 3
/* Valid flags for SECCOMP_SET_MODE_FILTER */
#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
/* Received notifications wait in killable state (only respond to fatal signals) */
#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
/*
* All BPF programs must return a 32-bit value.
* The bottom 16-bits are for optional return data.
* The upper 16-bits are ordered from least permissive values to most,
* as a signed value (so 0x8000000 is negative).
*
* The ordering ensures that a min_t() over composed return values always
* selects the least permissive choice.
*/
#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */
#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD
#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */
#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */
#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
/* Masks for the return value sections. */
#define SECCOMP_RET_ACTION_FULL 0xffff0000U
#define SECCOMP_RET_ACTION 0x7fff0000U
#define SECCOMP_RET_DATA 0x0000ffffU
/**
* struct seccomp_data - the format the BPF program executes over.
* @nr: the system call number
* @arch: indicates system call convention as an AUDIT_ARCH_* value
* as defined in <linux/audit.h>.
* @instruction_pointer: at the time of the system call.
* @args: up to 6 system call arguments always stored as 64-bit values
* regardless of the architecture.
*/
struct seccomp_data {
int nr;
__u32 arch;
__u64 instruction_pointer;
__u64 args[6];
};
struct seccomp_notif_sizes {
__u16 seccomp_notif;
__u16 seccomp_notif_resp;
__u16 seccomp_data;
};
struct seccomp_notif {
__u64 id;
__u32 pid;
__u32 flags;
struct seccomp_data data;
};
/*
* Valid flags for struct seccomp_notif_resp
*
* Note, the SECCOMP_USER_NOTIF_FLAG_CONTINUE flag must be used with caution!
* If set by the process supervising the syscalls of another process the
* syscall will continue. This is problematic because of an inherent TOCTOU.
* An attacker can exploit the time while the supervised process is waiting on
* a response from the supervising process to rewrite syscall arguments which
* are passed as pointers of the intercepted syscall.
* It should be absolutely clear that this means that the seccomp notifier
* _cannot_ be used to implement a security policy! It should only ever be used
* in scenarios where a more privileged process supervises the syscalls of a
* lesser privileged process to get around kernel-enforced security
* restrictions when the privileged process deems this safe. In other words,
* in order to continue a syscall the supervising process should be sure that
* another security mechanism or the kernel itself will sufficiently block
* syscalls if arguments are rewritten to something unsafe.
*
* Similar precautions should be applied when stacking SECCOMP_RET_USER_NOTIF
* or SECCOMP_RET_TRACE. For SECCOMP_RET_USER_NOTIF filters acting on the
* same syscall, the most recently added filter takes precedence. This means
* that the new SECCOMP_RET_USER_NOTIF filter can override any
* SECCOMP_IOCTL_NOTIF_SEND from earlier filters, essentially allowing all
* such filtered syscalls to be executed by sending the response
* SECCOMP_USER_NOTIF_FLAG_CONTINUE. Note that SECCOMP_RET_TRACE can equally
* be overriden by SECCOMP_USER_NOTIF_FLAG_CONTINUE.
*/
#define SECCOMP_USER_NOTIF_FLAG_CONTINUE (1UL << 0)
struct seccomp_notif_resp {
__u64 id;
__s64 val;
__s32 error;
__u32 flags;
};
/* valid flags for seccomp_notif_addfd */
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
/**
* struct seccomp_notif_addfd
* @id: The ID of the seccomp notification
* @flags: SECCOMP_ADDFD_FLAG_*
* @srcfd: The local fd number
* @newfd: Optional remote FD number if SETFD option is set, otherwise 0.
* @newfd_flags: The O_* flags the remote FD should have applied
*/
struct seccomp_notif_addfd {
__u64 id;
__u32 flags;
__u32 srcfd;
__u32 newfd;
__u32 newfd_flags;
};
#define SECCOMP_IOC_MAGIC '!'
#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type)
#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
/* Flags for seccomp notification fd ioctl. */
#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \
struct seccomp_notif_resp)
#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64)
/* On success, the return value is the remote process's added fd number */
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
struct seccomp_notif_addfd)
#endif /* _UAPI_LINUX_SECCOMP_H */