linux-stable/kernel/entry/common.c
Mark Rutland 99cf983cc8 sched/preempt: Add PREEMPT_DYNAMIC using static keys
Where an architecture selects HAVE_STATIC_CALL but not
HAVE_STATIC_CALL_INLINE, each static call has an out-of-line trampoline
which will either branch to a callee or return to the caller.

On such architectures, a number of constraints can conspire to make
those trampolines more complicated and potentially less useful than we'd
like. For example:

* Hardware and software control flow integrity schemes can require the
  addition of "landing pad" instructions (e.g. `BTI` for arm64), which
  will also be present at the "real" callee.

* Limited branch ranges can require that trampolines generate or load an
  address into a register and perform an indirect branch (or at least
  have a slow path that does so). This loses some of the benefits of
  having a direct branch.

* Interaction with SW CFI schemes can be complicated and fragile, e.g.
  requiring that we can recognise idiomatic codegen and remove
  indirections, at least until clang provides more helpful
  mechanisms for dealing with this.

For PREEMPT_DYNAMIC, we don't need the full power of static calls, as we
really only need to enable/disable specific preemption functions. We can
achieve the same effect without a number of the pain points above by
using static keys to fold early returns into the preemption functions
themselves rather than in an out-of-line trampoline, effectively
inlining the trampoline into the start of the function.

For arm64, this results in good code generation. For example, the
dynamic_cond_resched() wrapper looks as follows when enabled. When
disabled, the first `B` is replaced with a `NOP`, resulting in an early
return.

| <dynamic_cond_resched>:
|        bti     c
|        b       <dynamic_cond_resched+0x10>     // or `nop`
|        mov     w0, #0x0
|        ret
|        mrs     x0, sp_el0
|        ldr     x0, [x0, #8]
|        cbnz    x0, <dynamic_cond_resched+0x8>
|        paciasp
|        stp     x29, x30, [sp, #-16]!
|        mov     x29, sp
|        bl      <preempt_schedule_common>
|        mov     w0, #0x1
|        ldp     x29, x30, [sp], #16
|        autiasp
|        ret

... compared to the regular form of the function:

| <__cond_resched>:
|        bti     c
|        mrs     x0, sp_el0
|        ldr     x1, [x0, #8]
|        cbz     x1, <__cond_resched+0x18>
|        mov     w0, #0x0
|        ret
|        paciasp
|        stp     x29, x30, [sp, #-16]!
|        mov     x29, sp
|        bl      <preempt_schedule_common>
|        mov     w0, #0x1
|        ldp     x29, x30, [sp], #16
|        autiasp
|        ret

Any architecture which implements static keys should be able to use this
to implement PREEMPT_DYNAMIC with similar cost to non-inlined static
calls. Since this is likely to have greater overhead than (inlined)
static calls, PREEMPT_DYNAMIC is only defaulted to enabled when
HAVE_PREEMPT_DYNAMIC_CALL is selected.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Frederic Weisbecker <frederic@kernel.org>
Link: https://lore.kernel.org/r/20220214165216.2231574-6-mark.rutland@arm.com
2022-02-19 11:11:08 +01:00

484 lines
12 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
#include <linux/highmem.h>
#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
#include "common.h"
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
/*
* See comment for enter_from_user_mode() in entry-common.h
*
* Common entry-from-user bookkeeping, inlined into the syscall and
* irqentry entry points in this file. The statements are order
* dependent (lockdep first, then the context tracking exit, then
* tracing); do not reorder them.
*/
static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
arch_check_user_regs(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
/* On entry from user space the context tracking state must be CONTEXT_USER */
CT_WARN_ON(ct_state() != CONTEXT_USER);
user_exit_irqoff();
/* Tracing is only invoked inside the instrumentation_begin()/end() pair */
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
}
void noinstr enter_from_user_mode(struct pt_regs *regs)
{
__enter_from_user_mode(regs);
}
/*
 * Report a syscall entry to audit. Nothing is done unless
 * audit_context() is set for the current task.
 *
 * Six argument slots are fetched from @regs, of which the first four
 * are handed to audit_syscall_entry() together with @syscall.
 */
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
	unsigned long args[6];

	if (likely(!audit_context()))
		return;

	syscall_get_arguments(current, regs, args);
	audit_syscall_entry(syscall, args[0], args[1], args[2], args[3]);
}
/*
 * Run the syscall entry work selected by @work in a fixed order:
 * syscall user dispatch, ptrace, seccomp, the sys_enter tracepoint and
 * finally audit.
 *
 * Returns -1L when the user dispatch, ptrace or seccomp step ends the
 * sequence early. Otherwise returns the last step's result if it is
 * non-zero, else the syscall number re-read from @regs.
 */
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
				unsigned long work)
{
	long rc = 0;

	/*
	 * Handle Syscall User Dispatch. This must come first, since
	 * the ABI here can be something that doesn't make sense for
	 * other syscall_work features.
	 */
	if ((work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) &&
	    syscall_user_dispatch(regs))
		return -1L;

	/* Handle ptrace */
	if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
		rc = arch_syscall_enter_tracehook(regs);
		if (rc != 0 || (work & SYSCALL_WORK_SYSCALL_EMU))
			return -1L;
	}

	/* Do seccomp after ptrace, to catch any tracer changes. */
	if (work & SYSCALL_WORK_SECCOMP) {
		rc = __secure_computing(NULL);
		if (rc == -1L)
			return rc;
	}

	/* Either of the above might have changed the syscall number */
	syscall = syscall_get_nr(current, regs);

	if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall);

	syscall_enter_audit(regs, syscall);

	return rc ? rc : syscall;
}
/*
 * Take one snapshot of the current thread's syscall_work flags and, if
 * any SYSCALL_WORK_ENTER bit is set, hand over to syscall_trace_enter().
 *
 * Returns @syscall unchanged when no entry work is pending, otherwise
 * whatever syscall_trace_enter() returns.
 */
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
	unsigned long work = READ_ONCE(current_thread_info()->syscall_work);

	if (!(work & SYSCALL_WORK_ENTER))
		return syscall;

	return syscall_trace_enter(regs, syscall, work);
}
/*
 * Out-of-line wrapper around __syscall_enter_from_user_work(): performs
 * only the syscall entry work, without the __enter_from_user_mode()
 * step or the interrupt enabling done by syscall_enter_from_user_mode().
 *
 * Returns the value produced by __syscall_enter_from_user_work().
 */
long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
{
	return __syscall_enter_from_user_work(regs, syscall);
}
/*
 * Complete syscall entry from user mode: run the common
 * __enter_from_user_mode() sequence, then enable interrupts and perform
 * the syscall entry work inside an instrumentable section.
 *
 * Returns the value produced by __syscall_enter_from_user_work().
 */
noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
	long nr;

	__enter_from_user_mode(regs);

	instrumentation_begin();
	local_irq_enable();
	nr = __syscall_enter_from_user_work(regs, syscall);
	instrumentation_end();

	return nr;
}
/*
* Entry-from-user preparation without the syscall entry work: runs
* __enter_from_user_mode() and then enables interrupts. Unlike
* syscall_enter_from_user_mode(), this does not call
* __syscall_enter_from_user_work().
*/
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
__enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
/*
* See comment for exit_to_user_mode() in entry-common.h
*
* Last step of every return-to-user path in this file. The order of
* the tracing, lockdep and context tracking calls matters; do not
* reorder them.
*/
static __always_inline void __exit_to_user_mode(void)
{
/* The tracing and lockdep "prepare" hooks run in an instrumentable section */
instrumentation_begin();
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
user_enter_irqoff();
arch_exit_to_user_mode();
/* The final lockdep update comes last, after the context tracking switch */
lockdep_hardirqs_on(CALLER_ADDR0);
}
void noinstr exit_to_user_mode(void)
{
__exit_to_user_mode();
}
/*
 * Workaround to allow gradual conversion of architecture code: an
 * empty __weak default which ignores both @regs and @has_signal.
 */
void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
}
static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
{
if (ti_work & _TIF_NOTIFY_SIGNAL)
tracehook_notify_signal();
arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
}
/*
 * Process pending exit-to-user work until none of the
 * EXIT_TO_USER_MODE_WORK bits remain set.
 *
 * @ti_work is the caller's snapshot of the thread flags. Each pass
 * handles the work items with interrupts enabled, disables interrupts
 * again and then re-reads the flags.
 *
 * Returns the most recent thread flag snapshot.
 */
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
					    unsigned long ti_work)
{
	/*
	 * Before returning to user space ensure that all pending work
	 * items have been completed. The flags are re-read as the loop
	 * step, i.e. after interrupts have been disabled again.
	 */
	for (; ti_work & EXIT_TO_USER_MODE_WORK; ti_work = read_thread_flags()) {
		local_irq_enable_exit_to_user(ti_work);

		if (ti_work & _TIF_NEED_RESCHED)
			schedule();

		if (ti_work & _TIF_UPROBE)
			uprobe_notify_resume(regs);

		if (ti_work & _TIF_PATCH_PENDING)
			klp_update_patch_state(current);

		if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
			handle_signal_work(regs, ti_work);

		if (ti_work & _TIF_NOTIFY_RESUME)
			tracehook_notify_resume(regs);

		/* Architecture specific TIF work */
		arch_exit_to_user_mode_work(regs, ti_work);

		/*
		 * Disable interrupts and reevaluate the work flags as they
		 * might have changed while interrupts and preemption were
		 * enabled above.
		 */
		local_irq_disable_exit_to_user();

		/* Check if any of the above work has queued a deferred wakeup */
		tick_nohz_user_enter_prepare();
	}

	/* Return the latest work state for arch_exit_to_user_mode() */
	return ti_work;
}
/*
 * Common preparation for returning to user mode. Must be called with
 * interrupts disabled, which is asserted on entry and again before
 * returning.
 *
 * The thread flags are sampled first. If any EXIT_TO_USER_MODE_WORK bit
 * is set, the work loop runs and hands back a refreshed snapshot, which
 * is then passed to the architecture hook.
 */
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long pending = read_thread_flags();

	lockdep_assert_irqs_disabled();

	/* Flush pending rcuog wakeup before the last need_resched() check */
	tick_nohz_user_enter_prepare();

	if (unlikely(pending & EXIT_TO_USER_MODE_WORK))
		pending = exit_to_user_mode_loop(regs, pending);

	arch_exit_to_user_mode_prepare(regs, pending);

	/* Ensure that the address limit is intact and no locks are held */
	addr_limit_user_check();
	kmap_assert_nomap();
	lockdep_assert_irqs_disabled();
	lockdep_sys_exit();
}
/*
 * Decide whether a single step report is due on syscall exit.
 *
 * If SYSCALL_EMU is set, then the only reason to report is when
 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
 * instruction has been already reported in syscall_enter_from_user_mode().
 *
 * Returns true only when SYSCALL_WORK_SYSCALL_EXIT_TRAP is set in @work
 * and SYSCALL_WORK_SYSCALL_EMU is not.
 */
static inline bool report_single_step(unsigned long work)
{
	return !(work & SYSCALL_WORK_SYSCALL_EMU) &&
	       (work & SYSCALL_WORK_SYSCALL_EXIT_TRAP);
}
/*
 * Syscall exit work selected by @work: audit, the sys_exit tracepoint
 * and the ptrace exit report, in that order.
 */
static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
	bool single_step;

	/*
	 * If the syscall was rolled back due to syscall user dispatching,
	 * then the tracers below are not invoked for the same reason as
	 * the entry side was not invoked in syscall_trace_enter(): The ABI
	 * of these syscalls is unknown.
	 */
	if ((work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) &&
	    unlikely(current->syscall_dispatch.on_dispatch)) {
		current->syscall_dispatch.on_dispatch = false;
		return;
	}

	audit_syscall_exit(regs);

	if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
		trace_sys_exit(regs, syscall_get_return_value(current, regs));

	single_step = report_single_step(work);
	if (single_step || (work & SYSCALL_WORK_SYSCALL_TRACE))
		arch_syscall_exit_tracehook(regs, single_step);
}
/*
 * Syscall specific exit to user mode preparation. Runs with interrupts
 * enabled.
 *
 * With CONFIG_PROVE_LOCKING, a syscall that comes back with interrupts
 * disabled triggers a warning naming the syscall number, and interrupts
 * are re-enabled before continuing.
 */
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
	unsigned long exit_work = READ_ONCE(current_thread_info()->syscall_work);
	unsigned long nr = syscall_get_nr(current, regs);

	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);

	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
	    WARN(irqs_disabled(), "syscall %lu left IRQs disabled", nr))
		local_irq_enable();

	rseq_syscall(regs);

	/*
	 * Do one-time syscall specific work. If these work items are
	 * enabled, we want to run them exactly once per syscall exit with
	 * interrupts enabled.
	 */
	if (unlikely(exit_work & SYSCALL_WORK_EXIT))
		syscall_exit_work(regs, exit_work);
}
/*
* Syscall exit work followed by the common exit-to-user preparation.
* The syscall specific part runs with interrupts enabled; interrupts
* are then disabled because exit_to_user_mode_prepare() asserts that
* they are off.
*/
static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
exit_to_user_mode_prepare(regs);
}
/*
 * Out-of-line wrapper around __syscall_exit_to_user_mode_work(). Unlike
 * syscall_exit_to_user_mode(), it does not go on to call
 * __exit_to_user_mode().
 */
void syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
	__syscall_exit_to_user_mode_work(regs);
}
/*
* Complete syscall exit to user mode: the exit work runs inside an
* instrumentable section, and the final __exit_to_user_mode() step runs
* after that section has been closed.
*/
__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
__syscall_exit_to_user_mode_work(regs);
instrumentation_end();
__exit_to_user_mode();
}
/*
 * Interrupt/exception entry from user mode: runs the common
 * __enter_from_user_mode() sequence and nothing else.
 */
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
	__enter_from_user_mode(regs);
}
/*
* Interrupt/exception return to user mode: the common exit preparation
* runs inside an instrumentable section, and the final
* __exit_to_user_mode() step runs after that section has been closed.
*/
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
{
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
__exit_to_user_mode();
}
/*
* Entry handling for interrupts and exceptions.
*
* Three cases are distinguished:
*  - entry from user mode: delegated to irqentry_enter_from_user_mode()
*  - entry while the idle task is current (not compiled in for
*    TINY_RCU): rcu_irq_enter() is invoked unconditionally
*  - any other kernel entry: only rcu_irq_enter_check_tick() is invoked
*
* Returns a state whose exit_rcu member is true only for the idle task
* case; it is false for the other two.
*/
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
{
irqentry_state_t ret = {
.exit_rcu = false,
};
if (user_mode(regs)) {
irqentry_enter_from_user_mode(regs);
return ret;
}
/*
* If this entry hit the idle task invoke rcu_irq_enter() whether
* RCU is watching or not.
*
* Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
* terminate a grace period, if and only if the timer interrupt is
* not nested into another interrupt.
*
* Checking for rcu_is_watching() here would prevent the nesting
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interrupt and eventually claim
* quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
* consistent.
*
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
* as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
rcu_irq_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
instrumentation_end();
/* Record that rcu_irq_enter() was called for this entry */
ret.exit_rcu = true;
return ret;
}
/*
* If RCU is watching then RCU only wants to check whether it needs
* to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
* already contains a warning when RCU is not watching, so no point
* in having another one here.
*/
lockdep_hardirqs_off(CALLER_ADDR0);
instrumentation_begin();
rcu_irq_enter_check_tick();
trace_hardirqs_off_finish();
instrumentation_end();
return ret;
}
/*
 * Conditional reschedule on interrupt exit. Does nothing while
 * preempt_count() is non-zero. Otherwise runs the sanity checks and
 * calls preempt_schedule_irq() if need_resched() is set.
 */
void raw_irqentry_exit_cond_resched(void)
{
	if (preempt_count())
		return;

	/* Sanity check RCU and thread stack */
	rcu_irq_exit_check_preempt();
	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
		WARN_ON_ONCE(!on_thread_stack());

	if (need_resched())
		preempt_schedule_irq();
}
/*
 * PREEMPT_DYNAMIC: make the interrupt-exit reschedule switchable at
 * runtime, either through a static call or through a static key,
 * depending on what the architecture selects.
 */
#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
/* Defaults to true, i.e. the reschedule check is initially enabled */
DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);

/*
 * Static key variant: return early while the key is disabled, otherwise
 * fall through to raw_irqentry_exit_cond_resched().
 *
 * Keys created with DEFINE_STATIC_KEY_TRUE() are tested with the
 * static_branch_*() helpers from <linux/jump_label.h>.
 */
void dynamic_irqentry_exit_cond_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
		return;
	raw_irqentry_exit_cond_resched();
}
#endif
#endif
/*
* Exit handling for interrupts and exceptions. Must be called with
* interrupts disabled (asserted via lockdep).
*
* @regs:  register state of the interrupted context
* @state: presumably the value returned by the matching
*         irqentry_enter() call; only its exit_rcu member is used here
*
* Three cases are distinguished:
*  - return to user mode: delegated to irqentry_exit_to_user_mode()
*  - return to kernel context which had interrupts enabled: either the
*    careful RCU exit sequence (exit_rcu set) or the optional preemption
*    check followed by trace_hardirqs_on()
*  - return to kernel context which had interrupts disabled: only
*    rcu_irq_exit() when exit_rcu is set
*/
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
{
lockdep_assert_irqs_disabled();
/* Check whether this returns to user mode */
if (user_mode(regs)) {
irqentry_exit_to_user_mode(regs);
} else if (!regs_irqs_disabled(regs)) {
/*
* If RCU was not watching on entry this needs to be done
* carefully and needs the same ordering of lockdep/tracing
* and RCU as the return to user mode path.
*/
if (state.exit_rcu) {
instrumentation_begin();
/* Tell the tracer that IRET will enable interrupts */
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
instrumentation_end();
rcu_irq_exit();
lockdep_hardirqs_on(CALLER_ADDR0);
return;
}
instrumentation_begin();
/* The reschedule check is only compiled in with CONFIG_PREEMPTION */
if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
} else {
/*
* IRQ flags state is correct already. Just tell RCU if it
* was not watching on entry.
*/
if (state.exit_rcu)
rcu_irq_exit();
}
}
/*
* NMI entry handling.
*
* Samples lockdep's view of the hardirq state before anything else
* changes it and returns it in the lockdep member of the result, so
* that irqentry_nmi_exit() can restore it. The remaining calls are
* order dependent; do not reorder them.
*/
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
irqentry_state_t irq_state;
irq_state.lockdep = lockdep_hardirqs_enabled();
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
lockdep_hardirq_enter();
rcu_nmi_enter();
/* Tracing hooks are only invoked inside the instrumentable section */
instrumentation_begin();
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
return irq_state;
}
/*
* NMI exit handling; undoes the steps of irqentry_nmi_enter() in
* reverse order.
*
* @irq_state: presumably the value returned by the matching
*             irqentry_nmi_enter() call. The hardirqs-on tracing and
*             lockdep updates are only performed when its lockdep
*             member is set.
*/
void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
{
instrumentation_begin();
ftrace_nmi_exit();
if (irq_state.lockdep) {
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
}
instrumentation_end();
rcu_nmi_exit();
lockdep_hardirq_exit();
/* Final lockdep update, only if hardirqs were recorded as enabled at entry */
if (irq_state.lockdep)
lockdep_hardirqs_on(CALLER_ADDR0);
__nmi_exit();
}