linux-stable/include/linux/thread_info.h
Gabriel Krisman Bertazi 1446e1df9e kernel: Implement selective syscall userspace redirection
Introduce a mechanism to quickly disable/enable syscall handling for a
specific process and redirect to userspace via SIGSYS.  This is useful
for processes with parts that require syscall redirection and parts that
don't, but who need to perform this boundary crossing really fast,
without paying the cost of a system call to reconfigure syscall handling
on each boundary transition.  This is particularly important for Windows
games running over Wine.

The proposed interface looks like this:

  prctl(PR_SET_SYSCALL_USER_DISPATCH, <op>, <off>, <length>, [selector])

The range [<offset>,<offset>+<length>) is a part of the process memory
map that is allowed to by-pass the redirection code and dispatch
syscalls directly, such that in fast paths a process doesn't need to
disable the trap nor the kernel has to check the selector.  This is
essential to return from SIGSYS to a blocked area without triggering
another SIGSYS from rt_sigreturn.

selector is an optional pointer to a char-sized userspace memory region
that has a key switch for the mechanism. This key switch is set to
either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the
redirection without calling the kernel.

The feature is meant to be set per-thread and it is disabled on
fork/clone/execv.

Internally, this doesn't add overhead to the syscall hot path, and it
requires very little per-architecture support.  I avoided using seccomp,
even though it duplicates some functionality, due to previous feedback
that maybe it shouldn't mix with seccomp since it is not a security
mechanism.  And obviously, this should never be considered a security
mechanism, since any part of the program can by-pass it by using the
syscall dispatcher.

For the sysinfo benchmark, which measures the overhead added to
executing a native syscall that doesn't require interception, the
overhead using only the direct dispatcher region to issue syscalls is
pretty much irrelevant.  The overhead of using the selector goes around
40ns for a native (unredirected) syscall in my system, and it is (as
expected) dominated by the supervisor-mode user-address access.  In
fact, with SMAP off, the overhead is consistently less than 5ns on my
test box.

Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com
2020-12-02 15:07:56 +01:00

212 lines
6.1 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
*
* Copyright (C) 2002 David Howells (dhowells@redhat.com)
* - Incorporating suggestions made by Linus Torvalds
*/
#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
* definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
* including <asm/current.h> can cause a circular dependency on some platforms.
*/
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif
#include <linux/bitops.h>
/*
* For per-arch arch_within_stack_frames() implementations, defined in
* asm/thread_info.h.
*/
enum {
BAD_STACK = -1,
NOT_STACK = 0,
GOOD_FRAME,
GOOD_STACK,
};
#ifdef CONFIG_GENERIC_ENTRY
enum syscall_work_bit {
SYSCALL_WORK_BIT_SECCOMP,
SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
SYSCALL_WORK_BIT_SYSCALL_TRACE,
SYSCALL_WORK_BIT_SYSCALL_EMU,
SYSCALL_WORK_BIT_SYSCALL_AUDIT,
SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
};
#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#endif
#include <asm/thread_info.h>
#ifdef __KERNEL__
#ifndef THREAD_ALIGN
#define THREAD_ALIGN THREAD_SIZE
#endif
#define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
/*
* flag set/clear/test wrappers
* - pass TIF_xxxx constants to these functions
*/
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
set_bit(flag, (unsigned long *)&ti->flags);
}
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
clear_bit(flag, (unsigned long *)&ti->flags);
}
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
bool value)
{
if (value)
set_ti_thread_flag(ti, flag);
else
clear_ti_thread_flag(ti, flag);
}
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
return test_and_set_bit(flag, (unsigned long *)&ti->flags);
}
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
return test_and_clear_bit(flag, (unsigned long *)&ti->flags);
}
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
return test_bit(flag, (unsigned long *)&ti->flags);
}
#define set_thread_flag(flag) \
set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
test_ti_thread_flag(current_thread_info(), flag)
#ifdef CONFIG_GENERIC_ENTRY
#define set_syscall_work(fl) \
set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define set_task_syscall_work(t, fl) \
set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#else /* CONFIG_GENERIC_ENTRY */
#define set_syscall_work(fl) \
set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
clear_ti_thread_flag(current_thread_info(), TIF_##fl)
#define set_task_syscall_work(t, fl) \
set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */
#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED)
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
const void *obj, unsigned long len)
{
return 0;
}
#endif
#ifdef CONFIG_HARDENED_USERCOPY
extern void __check_object_size(const void *ptr, unsigned long n,
bool to_user);
static __always_inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
{
if (!__builtin_constant_p(n))
__check_object_size(ptr, n, to_user);
}
#else
static inline void check_object_size(const void *ptr, unsigned long n,
bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */
extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);
static inline void copy_overflow(int size, unsigned long count)
{
WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
int sz = __compiletime_object_size(addr);
if (unlikely(sz >= 0 && sz < bytes)) {
if (!__builtin_constant_p(bytes))
copy_overflow(sz, bytes);
else if (is_source)
__bad_copy_from();
else
__bad_copy_to();
return false;
}
if (WARN_ON_ONCE(bytes > INT_MAX))
return false;
check_object_size(addr, bytes, is_source);
return true;
}
#ifndef arch_setup_new_exec
static inline void arch_setup_new_exec(void) { }
#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_THREAD_INFO_H */