linux-stable/include/linux/context_tracking.h
Frederic Weisbecker 179a9cf792 context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK
The typical steps with context tracking are:

1) Task runs in userspace
2) Task enters the kernel (syscall/exception/IRQ)
3) Task switches from context tracking state CONTEXT_USER to
   CONTEXT_KERNEL (user_exit())
4) Task does stuff in kernel
5) Task switches from context tracking state CONTEXT_KERNEL to
   CONTEXT_USER (user_enter())
6) Task exits the kernel

If an exception fires between 5) and 6), the pt_regs and the context
tracking disagree on the context of the faulted/trapped instruction.
CONTEXT_KERNEL must be set before the exception handler, that's
unconditional for those handlers that want to be able to call into
schedule(), but CONTEXT_USER must be restored when the exception exits
whereas pt_regs tells that we are resuming to kernel space.

This can't be fixed with storing the context tracking state in a per-cpu
or per-task variable since another exception may fire onto the current
one and overwrite the saved state. Also the task can schedule. So it
has to be stored in a per task stack.

This is how exception_enter()/exception_exit() paper over the problem:

5) Task switches from context tracking state CONTEXT_KERNEL to
   CONTEXT_USER (user_enter())
5.1) Exception fires
5.2) prev_state = exception_enter() // save CONTEXT_USER to prev_state
                                    // and set CONTEXT_KERNEL
5.3) Exception handler
5.4) exception_enter(prev_state) // restore CONTEXT_USER
5.5) Exception resumes
6) Task exits the kernel

The condition to live without exception_enter()/exception_exit() is to
forbid exceptions and IRQs between 2) and 3) and between 5) and 6), or if
any is allowed to trigger, it won't call into context tracking, eg: NMIs,
and it won't schedule. These requirements are met by architectures
supporting CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK and those can
therefore afford not to implement this hack.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20201117151637.259084-3-frederic@kernel.org
2020-11-19 11:25:42 +01:00

181 lines
4.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CONTEXT_TRACKING_H
#define _LINUX_CONTEXT_TRACKING_H
#include <linux/sched.h>
#include <linux/vtime.h>
#include <linux/context_tracking_state.h>
#include <linux/instrumentation.h>
#include <asm/ptrace.h>
#ifdef CONFIG_CONTEXT_TRACKING
extern void context_tracking_cpu_set(int cpu);
/* Called with interrupts disabled. */
extern void __context_tracking_enter(enum ctx_state state);
extern void __context_tracking_exit(enum ctx_state state);
extern void context_tracking_enter(enum ctx_state state);
extern void context_tracking_exit(enum ctx_state state);
extern void context_tracking_user_enter(void);
extern void context_tracking_user_exit(void);
static inline void user_enter(void)
{
if (context_tracking_enabled())
context_tracking_enter(CONTEXT_USER);
}
static inline void user_exit(void)
{
if (context_tracking_enabled())
context_tracking_exit(CONTEXT_USER);
}
/* Called with interrupts disabled. */
static __always_inline void user_enter_irqoff(void)
{
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_USER);
}
static __always_inline void user_exit_irqoff(void)
{
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_USER);
}
static inline enum ctx_state exception_enter(void)
{
enum ctx_state prev_ctx;
if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) ||
!context_tracking_enabled())
return 0;
prev_ctx = this_cpu_read(context_tracking.state);
if (prev_ctx != CONTEXT_KERNEL)
context_tracking_exit(prev_ctx);
return prev_ctx;
}
static inline void exception_exit(enum ctx_state prev_ctx)
{
if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) &&
context_tracking_enabled()) {
if (prev_ctx != CONTEXT_KERNEL)
context_tracking_enter(prev_ctx);
}
}
/**
* ct_state() - return the current context tracking state if known
*
* Returns the current cpu's context tracking state if context tracking
* is enabled. If context tracking is disabled, returns
* CONTEXT_DISABLED. This should be used primarily for debugging.
*/
static __always_inline enum ctx_state ct_state(void)
{
return context_tracking_enabled() ?
this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
}
#else
static inline void user_enter(void) { }
static inline void user_exit(void) { }
static inline void user_enter_irqoff(void) { }
static inline void user_exit_irqoff(void) { }
static inline enum ctx_state exception_enter(void) { return 0; }
static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
#endif /* !CONFIG_CONTEXT_TRACKING */
#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);
#else
static inline void context_tracking_init(void) { }
#endif /* CONFIG_CONTEXT_TRACKING_FORCE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
/* must be called with irqs disabled */
static __always_inline void guest_enter_irqoff(void)
{
instrumentation_begin();
if (vtime_accounting_enabled_this_cpu())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
instrumentation_end();
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_GUEST);
/* KVM does not hold any references to rcu protected data when it
* switches CPU into a guest mode. In fact switching to a guest mode
* is very similar to exiting to userspace from rcu point of view. In
* addition CPU may stay in a guest mode for quite a long time (up to
* one time slice). Lets treat guest mode as quiescent state, just like
* we do with user-mode execution.
*/
if (!context_tracking_enabled_this_cpu()) {
instrumentation_begin();
rcu_virt_note_context_switch(smp_processor_id());
instrumentation_end();
}
}
static __always_inline void guest_exit_irqoff(void)
{
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_GUEST);
instrumentation_begin();
if (vtime_accounting_enabled_this_cpu())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
instrumentation_end();
}
#else
static __always_inline void guest_enter_irqoff(void)
{
/*
* This is running in ioctl context so its safe
* to assume that it's the stime pending cputime
* to flush.
*/
instrumentation_begin();
vtime_account_kernel(current);
current->flags |= PF_VCPU;
rcu_virt_note_context_switch(smp_processor_id());
instrumentation_end();
}
static __always_inline void guest_exit_irqoff(void)
{
instrumentation_begin();
/* Flush the guest cputime we spent on the guest */
vtime_account_kernel(current);
current->flags &= ~PF_VCPU;
instrumentation_end();
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void guest_exit(void)
{
unsigned long flags;
local_irq_save(flags);
guest_exit_irqoff();
local_irq_restore(flags);
}
#endif