linux-stable/arch/x86/entry/common.c

/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 * GPL v2
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/errno.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
#include <linux/export.h>
#include <linux/context_tracking.h>
#include <linux/user-return-notifier.h>
#include <linux/uprobes.h>

#include <asm/desc.h>
#include <asm/traps.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

#ifdef CONFIG_CONTEXT_TRACKING
/*
 * Called on entry from user mode with IRQs off.
 *
 * Warns (via CT_WARN_ON) if the context tracking state is anything other
 * than CONTEXT_USER, then calls user_exit().  Only built when
 * CONFIG_CONTEXT_TRACKING is enabled.
 */
__visible void enter_from_user_mode(void)
{
	/* Sanity check only: the state is not corrected here, just reported. */
	CT_WARN_ON(ct_state() != CONTEXT_USER);
	user_exit();
}
#endif

/*
 * Hand the syscall entry to audit_syscall_entry(): the syscall number
 * from regs->orig_ax followed by four argument registers chosen
 * according to @arch.
 */
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
	/* AUDIT_ARCH_X86_64: pass di, si, dx, r10. */
	if (arch == AUDIT_ARCH_X86_64) {
		audit_syscall_entry(regs->orig_ax, regs->di, regs->si,
				    regs->dx, regs->r10);
		return;
	}
#endif

	/* Any other arch value: pass bx, cx, dx, si. */
	audit_syscall_entry(regs->orig_ax, regs->bx, regs->cx,
			    regs->dx, regs->si);
}

/*
 * Phase 1 of the syscall entry work: handle context tracking, seccomp
 * phase 1 and (when it is the only work left) auditing, and tell the
 * caller whether phase 2 must run.
 *
 * We can return 0 to resume the syscall or anything else to go to phase
 * 2.  If we resume the syscall, we need to put something appropriate in
 * regs->orig_ax.
 *
 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
 * are fully functional.
 *
 * For phase 2's benefit, our return value is:
 * 0:			resume the syscall
 * 1:			go to phase 2; no seccomp phase 2 needed
 * anything else:	go to phase 2; pass return value to seccomp
 */
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
	unsigned long ret = 0;
	u32 work;

	/* regs must be exactly task_pt_regs(current); anything else is a bug. */
	BUG_ON(regs != task_pt_regs(current));

	/*
	 * Snapshot the entry-work bits once.  Each bit is cleared from
	 * 'work' below as it is handled, so whatever remains is work that
	 * has not been done yet.
	 */
	work = ACCESS_ONCE(current_thread_info()->flags) &
		_TIF_WORK_SYSCALL_ENTRY;

#ifdef CONFIG_CONTEXT_TRACKING
	/*
	 * If TIF_NOHZ is set, we are required to call user_exit() before
	 * doing anything that could touch RCU.
	 */
	if (work & _TIF_NOHZ) {
		enter_from_user_mode();
		work &= ~_TIF_NOHZ;
	}
#endif

#ifdef CONFIG_SECCOMP
	/*
	 * Do seccomp first -- it should minimize exposure of other
	 * code, and keeping seccomp fast is probably more valuable
	 * than the rest of this.
	 */
	if (work & _TIF_SECCOMP) {
		struct seccomp_data sd;

		/*
		 * Describe the syscall for seccomp: arch, number, user IP
		 * and six arguments taken from the registers matching arch.
		 */
		sd.arch = arch;
		sd.nr = regs->orig_ax;
		sd.instruction_pointer = regs->ip;
#ifdef CONFIG_X86_64
		if (arch == AUDIT_ARCH_X86_64) {
			sd.args[0] = regs->di;
			sd.args[1] = regs->si;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->r10;
			sd.args[4] = regs->r8;
			sd.args[5] = regs->r9;
		} else
#endif
		{
			sd.args[0] = regs->bx;
			sd.args[1] = regs->cx;
			sd.args[2] = regs->dx;
			sd.args[3] = regs->si;
			sd.args[4] = regs->di;
			sd.args[5] = regs->bp;
		}

		/*
		 * The return-value contract documented above depends on
		 * these exact values: OK must be 0 and SKIP must be 1 so
		 * that any other seccomp_phase1() result is > 1.
		 */
		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);

		ret = seccomp_phase1(&sd);
		if (ret == SECCOMP_PHASE1_SKIP) {
			/*
			 * Skip requested: store -1 as the syscall number and
			 * carry on as if seccomp had returned OK.
			 */
			regs->orig_ax = -1;
			ret = 0;
		} else if (ret != SECCOMP_PHASE1_OK) {
			return ret;  /* Go directly to phase 2 */
		}

		work &= ~_TIF_SECCOMP;
	}
#endif

	/* Do our best to finish without phase 2. */
	if (work == 0)
		return ret;  /* seccomp and/or nohz only (ret == 0 here) */

#ifdef CONFIG_AUDITSYSCALL
	if (work == _TIF_SYSCALL_AUDIT) {
		/*
		 * If there is no more work to be done except auditing,
		 * then audit in phase 1.  Phase 2 always audits, so, if
		 * we audit here, then we can't go on to phase 2.
		 */
		do_audit_syscall_entry(regs, arch);
		return 0;
	}
#endif

	return 1;  /* Something is enabled that we can't handle in phase 1 */
}

/*
 * Phase 2 of the syscall entry work: finish seccomp if phase 1 asked
 * for it, then run the ptrace, tracepoint and audit entry hooks.
 *
 * @phase1_result is the value returned by syscall_trace_enter_phase1();
 * values greater than 1 are forwarded to seccomp_phase2().
 *
 * Returns the syscall nr to run (which should match regs->orig_ax), or
 * -1 when seccomp_phase2() fails, TIF_SYSCALL_EMU is set, or
 * tracehook_report_syscall_entry() returns nonzero.
 */
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
				unsigned long phase1_result)
{
	bool skip_syscall;
	u32 work;

	/* One snapshot of the entry-work bits, taken before anything else. */
	work = ACCESS_ONCE(current_thread_info()->flags);
	work &= _TIF_WORK_SYSCALL_ENTRY;

	BUG_ON(regs != task_pt_regs(current));

	/*
	 * A single-stepped sysenter/syscall insn traps in kernel mode,
	 * where do_debug() clears TF and sets TIF_SINGLESTEP.  If user
	 * mode had set TF itself, it is still clear after do_debug(), so
	 * put it back to restore the user state.  On the slow path TF
	 * was already set and this is a no-op.
	 */
	if (work & _TIF_SINGLESTEP)
		regs->flags |= X86_EFLAGS_TF;

#ifdef CONFIG_SECCOMP
	/*
	 * seccomp_phase2 runs ahead of the other hooks so that they
	 * observe any changes made by a seccomp tracer.
	 */
	if (phase1_result > 1) {
		/* seccomp failures shouldn't expose any additional code. */
		if (seccomp_phase2(phase1_result))
			return -1;
	}
#endif

	/* With TIF_SYSCALL_EMU set the syscall is never run. */
	skip_syscall = unlikely(work & _TIF_SYSCALL_EMU);

	/* Report to the tracer; a nonzero result also cancels the syscall. */
	if (skip_syscall || test_thread_flag(TIF_SYSCALL_TRACE)) {
		if (tracehook_report_syscall_entry(regs))
			skip_syscall = true;
	}

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_enter(regs, regs->orig_ax);

	do_audit_syscall_entry(regs, arch);

	if (skip_syscall)
		return -1L;

	/* Read orig_ax last, after every hook above has run. */
	return regs->orig_ax;
}

/*
 * Run both phases of the syscall entry work back to back.
 *
 * Returns regs->orig_ax when phase 1 reports that nothing more is
 * needed (result 0); otherwise returns whatever phase 2 returns.
 */
long syscall_trace_enter(struct pt_regs *regs)
{
	unsigned long phase1_result;
	u32 arch;

	if (is_ia32_task())
		arch = AUDIT_ARCH_I386;
	else
		arch = AUDIT_ARCH_X86_64;

	phase1_result = syscall_trace_enter_phase1(regs, arch);
	if (phase1_result != 0)
		return syscall_trace_enter_phase2(regs, arch, phase1_result);

	/* Phase 1 handled everything. */
	return regs->orig_ax;
}

void syscall_trace_leave(struct pt_regs *regs)
{
	bool step;

	/*
	 * We may come here right after calling schedule_user()
	 * or do_notify_resume(), in which case we can be in RCU
	 * user mode.
	 */
	user_exit();

	audit_syscall_exit(regs);

	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
		trace_sys_exit(regs, regs->ax);

	/*
	 * If TIF_SYSCALL_EMU is set, we only get here because of
	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
	 * We already reported this syscall instruction in
	 * syscall_trace_enter().
	 */
	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
			!test_thread_flag(TIF_SYSCALL_EMU);
	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
		tracehook_report_syscall_exit(regs, step);

	user_enter();
}

/*
 * Notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 *
 * Acts on the bits in @thread_info_flags as passed in by the caller;
 * @unused is ignored.  The work is bracketed by user_exit() and
 * user_enter().
 */
__visible void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
	const __u32 pending = thread_info_flags;

	user_exit();

	if (pending & _TIF_UPROBE)
		uprobe_notify_resume(regs);

	/* Pending signals are delivered next. */
	if (pending & _TIF_SIGPENDING)
		do_signal(regs);

	/* Clear the thread flag before running the notify-resume hook. */
	if (pending & _TIF_NOTIFY_RESUME) {
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
	}

	if (pending & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();

	user_enter();
}
x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:23 +00:00			`/*`
			`* common.c - C code for kernel entry and exit`
			`* Copyright (c) 2015 Andrew Lutomirski`
			`* GPL v2`
			`*`
			`* Based on asm and ptrace code by many authors. The code here originated`
			`* in ptrace.c and signal.c.`
			`*/`

			`#include <linux/kernel.h>`
			`#include <linux/sched.h>`
			`#include <linux/mm.h>`
			`#include <linux/smp.h>`
			`#include <linux/errno.h>`
			`#include <linux/ptrace.h>`
			`#include <linux/tracehook.h>`
			`#include <linux/audit.h>`
			`#include <linux/seccomp.h>`
			`#include <linux/signal.h>`
			`#include <linux/export.h>`
			`#include <linux/context_tracking.h>`
			`#include <linux/user-return-notifier.h>`
			`#include <linux/uprobes.h>`

			`#include <asm/desc.h>`
			`#include <asm/traps.h>`

			`#define CREATE_TRACE_POINTS`
			`#include <trace/events/syscalls.h>`

x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:25 +00:00			`#ifdef CONFIG_CONTEXT_TRACKING`
			`/* Called on entry from user mode with IRQs off. */`
			`__visible void enter_from_user_mode(void)`
			`{`
			`CT_WARN_ON(ct_state() != CONTEXT_USER);`
			`user_exit();`
			`}`
			`#endif`

x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:23 +00:00			`static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)`
			`{`
			`#ifdef CONFIG_X86_64`
			`if (arch == AUDIT_ARCH_X86_64) {`
			`audit_syscall_entry(regs->orig_ax, regs->di,`
			`regs->si, regs->dx, regs->r10);`
			`} else`
			`#endif`
			`{`
			`audit_syscall_entry(regs->orig_ax, regs->bx,`
			`regs->cx, regs->dx, regs->si);`
			`}`
			`}`

			`/*`
			`* We can return 0 to resume the syscall or anything else to go to phase`
			`* 2. If we resume the syscall, we need to put something appropriate in`
			`* regs->orig_ax.`
			`*`
			`* NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax`
			`* are fully functional.`
			`*`
			`* For phase 2's benefit, our return value is:`
			`* 0: resume the syscall`
			`* 1: go to phase 2; no seccomp phase 2 needed`
			`* anything else: go to phase 2; pass return value to seccomp`
			`*/`
			`unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)`
			`{`
			`unsigned long ret = 0;`
			`u32 work;`

			`BUG_ON(regs != task_pt_regs(current));`

			`work = ACCESS_ONCE(current_thread_info()->flags) &`
			`_TIF_WORK_SYSCALL_ENTRY;`

x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:25 +00:00			`#ifdef CONFIG_CONTEXT_TRACKING`
x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:23 +00:00			`/*`
			`* If TIF_NOHZ is set, we are required to call user_exit() before`
			`* doing anything that could touch RCU.`
			`*/`
			`if (work & _TIF_NOHZ) {`
x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:25 +00:00			`enter_from_user_mode();`
x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:23 +00:00			`work &= ~_TIF_NOHZ;`
			`}`
x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:25 +00:00			`#endif`
x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski <luto@kernel.org> Cc: Andy Lutomirski <luto@amacapital.net> Cc: Borislav Petkov <bp@alien8.de> Cc: Brian Gerst <brgerst@gmail.com> Cc: Denys Vlasenko <dvlasenk@redhat.com> Cc: Denys Vlasenko <vda.linux@googlemail.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Kees Cook <keescook@chromium.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rik van Riel <riel@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> 2015-07-03 19:44:23 +00:00
			`#ifdef CONFIG_SECCOMP`
			`/*`
			`* Do seccomp first -- it should minimize exposure of other`
			`* code, and keeping seccomp fast is probably more valuable`
			`* than the rest of this.`
			`*/`
			`if (work & _TIF_SECCOMP) {`
			`struct seccomp_data sd;`

			`sd.arch = arch;`
			`sd.nr = regs->orig_ax;`
			`sd.instruction_pointer = regs->ip;`
			`#ifdef CONFIG_X86_64`
			`if (arch == AUDIT_ARCH_X86_64) {`
			`sd.args[0] = regs->di;`
			`sd.args[1] = regs->si;`
			`sd.args[2] = regs->dx;`
			`sd.args[3] = regs->r10;`
			`sd.args[4] = regs->r8;`
			`sd.args[5] = regs->r9;`
			`} else`
			`#endif`
			`{`
			`sd.args[0] = regs->bx;`
			`sd.args[1] = regs->cx;`
			`sd.args[2] = regs->dx;`
			`sd.args[3] = regs->si;`
			`sd.args[4] = regs->di;`
			`sd.args[5] = regs->bp;`
			`}`

			`BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);`
			`BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);`

			`ret = seccomp_phase1(&sd);`
			`if (ret == SECCOMP_PHASE1_SKIP) {`
			`regs->orig_ax = -1;`
			`ret = 0;`
			`} else if (ret != SECCOMP_PHASE1_OK) {`
			`return ret; /* Go directly to phase 2 */`
			`}`

			`work &= ~_TIF_SECCOMP;`
			`}`
			`#endif`

			`/* Do our best to finish without phase 2. */`
			`if (work == 0)`
			`return ret; /* seccomp and/or nohz only (ret == 0 here) */`

			`#ifdef CONFIG_AUDITSYSCALL`
			`if (work == _TIF_SYSCALL_AUDIT) {`
			`/*`
			`* If there is no more work to be done except auditing,`
			`* then audit in phase 1. Phase 2 always audits, so, if`
			`* we audit here, then we can't go on to phase 2.`
			`*/`
			`do_audit_syscall_entry(regs, arch);`
			`return 0;`
			`}`
			`#endif`

			`return 1; /* Something is enabled that we can't handle in phase 1 */`
			`}`

			`/* Returns the syscall nr to run (which should match regs->orig_ax). */`
			`long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,`
			`unsigned long phase1_result)`
			`{`
			`long ret = 0;`
			`u32 work = ACCESS_ONCE(current_thread_info()->flags) &`
			`_TIF_WORK_SYSCALL_ENTRY;`

			`BUG_ON(regs != task_pt_regs(current));`

			`/*`
			`* If we stepped into a sysenter/syscall insn, it trapped in`
			`* kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.`
			`* If user-mode had set TF itself, then it's still clear from`
			`* do_debug() and we need to set it again to restore the user`
			`* state. If we entered on the slow path, TF was already set.`
			`*/`
			`if (work & _TIF_SINGLESTEP)`
			`regs->flags |= X86_EFLAGS_TF;`

			`#ifdef CONFIG_SECCOMP`
			`/*`
			`* Call seccomp_phase2 before running the other hooks so that`
			`* they can see any changes made by a seccomp tracer.`
			`*/`
			`if (phase1_result > 1 && seccomp_phase2(phase1_result)) {`
			`/* seccomp failures shouldn't expose any additional code. */`
			`return -1;`
			`}`
			`#endif`

			`if (unlikely(work & _TIF_SYSCALL_EMU))`
			`ret = -1L;`

			`if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&`
			`tracehook_report_syscall_entry(regs))`
			`ret = -1L;`

			`if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))`
			`trace_sys_enter(regs, regs->orig_ax);`

			`do_audit_syscall_entry(regs, arch);`

			`return ret ?: regs->orig_ax;`
			`}`

			`long syscall_trace_enter(struct pt_regs *regs)`
			`{`
			`u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;`
			`unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);`

			`if (phase1_result == 0)`
			`return regs->orig_ax;`
			`else`
			`return syscall_trace_enter_phase2(regs, arch, phase1_result);`
			`}`

			`void syscall_trace_leave(struct pt_regs *regs)`
			`{`
			`bool step;`

			`/*`
			`* We may come here right after calling schedule_user()`
			`* or do_notify_resume(), in which case we can be in RCU`
			`* user mode.`
			`*/`
			`user_exit();`

			`audit_syscall_exit(regs);`

			`if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))`
			`trace_sys_exit(regs, regs->ax);`

			`/*`
			`* If TIF_SYSCALL_EMU is set, we only get here because of`
			`* TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).`
			`* We already reported this syscall instruction in`
			`* syscall_trace_enter().`
			`*/`
			`step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&`
			`!test_thread_flag(TIF_SYSCALL_EMU);`
			`if (step || test_thread_flag(TIF_SYSCALL_TRACE))`
			`tracehook_report_syscall_exit(regs, step);`

			`user_enter();`
			`}`

			`/*`
			`* notification of userspace execution resumption`
			`* - triggered by the TIF_WORK_MASK flags`
			`*/`
			`__visible void`
			`do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)`
			`{`
			`user_exit();`

			`if (thread_info_flags & _TIF_UPROBE)`
			`uprobe_notify_resume(regs);`

			`/* deal with pending signal delivery */`
			`if (thread_info_flags & _TIF_SIGPENDING)`
			`do_signal(regs);`

			`if (thread_info_flags & _TIF_NOTIFY_RESUME) {`
			`clear_thread_flag(TIF_NOTIFY_RESUME);`
			`tracehook_notify_resume(regs);`
			`}`
			`if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)`
			`fire_user_return_notifiers();`

			`user_enter();`
			`}`