Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Thomas Gleixner:
 "An unfortunately larger set of fixes, but a large portion is
  selftests:

   - Fix the missing clusterid initializaiton for x2apic cluster
     management which caused boot failures due to IPIs being sent to the
     wrong cluster

   - Drop TX_COMPAT when a 64bit executable is exec()'ed from a compat
     task

   - Wrap access to __supported_pte_mask in __startup_64() where clang
     compile fails due to a non PC relative access being generated.

   - Two fixes for 5 level paging fallout in the decompressor:

      - Handle GOT correctly for paging_prepare() and
        cleanup_trampoline()

      - Fix the page table handling in cleanup_trampoline() to avoid
        page table corruption.

   - Stop special casing protection key 0 as this is inconsistent with
     the manpage and also inconsistent with the allocation map handling.

   - Override the protection key wen moving away from PROT_EXEC to
     prevent inaccessible memory.

   - Fix and update the protection key selftests to address breakage and
     to cover the above issue

   - Add a MOV SS self test"

[ Part of the x86 fixes were in the earlier core pull due to dependencies ]

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  x86/mm: Drop TS_COMPAT on 64-bit exec() syscall
  x86/apic/x2apic: Initialize cluster ID properly
  x86/boot/compressed/64: Fix moving page table out of trampoline memory
  x86/boot/compressed/64: Set up GOT for paging_prepare() and cleanup_trampoline()
  x86/pkeys: Do not special case protection key 0
  x86/pkeys/selftests: Add a test for pkey 0
  x86/pkeys/selftests: Save off 'prot' for allocations
  x86/pkeys/selftests: Fix pointer math
  x86/pkeys: Override pkey when moving away from PROT_EXEC
  x86/pkeys/selftests: Fix pkey exhaustion test off-by-one
  x86/pkeys/selftests: Add PROT_EXEC test
  x86/pkeys/selftests: Factor out "instruction page"
  x86/pkeys/selftests: Allow faults on unknown keys
  x86/pkeys/selftests: Avoid printf-in-signal deadlocks
  x86/pkeys/selftests: Remove dead debugging code, fix dprint_in_signal
  x86/pkeys/selftests: Stop using assert()
  x86/pkeys/selftests: Give better unexpected fault error messages
  x86/selftests: Add mov_to_ss test
  x86/mpx/selftests: Adjust the self-test to fresh distros that export the MPX ABI
  x86/pkeys/selftests: Adjust the self-test to fresh distros that export the pkeys ABI
  ...
This commit is contained in:
Linus Torvalds 2018-05-20 11:28:32 -07:00
commit 8a6bd2f40e
13 changed files with 585 additions and 129 deletions

View file

@ -305,6 +305,25 @@ ENTRY(startup_64)
/* Set up the stack */
leaq boot_stack_end(%rbx), %rsp
/*
* paging_prepare() and cleanup_trampoline() below can have GOT
* references. Adjust the table with address we are running at.
*
* Zero RAX for adjust_got: the GOT was not adjusted before;
* there's no adjustment to undo.
*/
xorq %rax, %rax
/*
* Calculate the address the binary is loaded at and use it as
* a GOT adjustment.
*/
call 1f
1: popq %rdi
subq $1b, %rdi
call adjust_got
/*
* At this point we are in long mode with 4-level paging enabled,
* but we might want to enable 5-level paging or vice versa.
@ -370,10 +389,14 @@ trampoline_return:
/*
* cleanup_trampoline() would restore trampoline memory.
*
* RDI is address of the page table to use instead of page table
* in trampoline memory (if required).
*
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
leaq top_pgtable(%rbx), %rdi
call cleanup_trampoline
popq %rsi
@ -381,6 +404,21 @@ trampoline_return:
pushq $0
popfq
/*
* Previously we've adjusted the GOT with address the binary was
* loaded at. Now we need to re-adjust for relocation address.
*
* Calculate the address the binary is loaded at, so that we can
* undo the previous GOT adjustment.
*/
call 1f
1: popq %rax
subq $1b, %rax
/* The new adjustment is the relocation address */
movq %rbx, %rdi
call adjust_got
/*
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
@ -481,19 +519,6 @@ relocated:
shrq $3, %rcx
rep stosq
/*
* Adjust our own GOT
*/
leaq _got(%rip), %rdx
leaq _egot(%rip), %rcx
1:
cmpq %rcx, %rdx
jae 2f
addq %rbx, (%rdx)
addq $8, %rdx
jmp 1b
2:
/*
* Do the extraction, and jump to the new kernel..
*/
@ -512,6 +537,27 @@ relocated:
*/
jmp *%rax
/*
* Adjust the global offset table
*
* RAX is the previous adjustment of the table to undo (use 0 if it's the
* first time we touch GOT).
* RDI is the new adjustment to apply.
*/
adjust_got:
/* Walk through the GOT adding the address to the entries */
leaq _got(%rip), %rdx
leaq _egot(%rip), %rcx
1:
cmpq %rcx, %rdx
jae 2f
subq %rax, (%rdx) /* Undo previous adjustment */
addq %rdi, (%rdx) /* Apply the new adjustment */
addq $8, %rdx
jmp 1b
2:
ret
.code32
/*
* This is the 32-bit trampoline that will be copied over to low memory.
@ -649,3 +695,10 @@ boot_stack_end:
.balign 4096
pgtable:
.fill BOOT_PGT_SIZE, 1, 0
/*
* The page table is going to be used instead of page table in the trampoline
* memory.
*/
top_pgtable:
.fill PAGE_SIZE, 1, 0

View file

@ -22,14 +22,6 @@ struct paging_config {
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
/*
* The page table is going to be used instead of page table in the trampoline
* memory.
*
* It must not be in BSS as BSS is cleared after cleanup_trampoline().
*/
static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data);
/*
* Trampoline address will be printed by extract_kernel() for debugging
* purposes.
@ -134,7 +126,7 @@ struct paging_config paging_prepare(void)
return paging_config;
}
void cleanup_trampoline(void)
void cleanup_trampoline(void *pgtable)
{
void *trampoline_pgtable;
@ -145,8 +137,8 @@ void cleanup_trampoline(void)
* if it's there.
*/
if ((void *)__native_read_cr3() == trampoline_pgtable) {
memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE);
native_write_cr3((unsigned long)top_pgtable);
memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
native_write_cr3((unsigned long)pgtable);
}
/* Restore trampoline memory */

View file

@ -193,7 +193,7 @@ static inline int init_new_context(struct task_struct *tsk,
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
/* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;

View file

@ -2,6 +2,8 @@
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
#define ARCH_DEFAULT_PKEY 0
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm_struct *mm);
static inline int execute_only_pkey(struct mm_struct *mm)
{
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return 0;
return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
}
@ -49,13 +51,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
{
/*
* "Allocated" pkeys are those that have been returned
* from pkey_alloc(). pkey 0 is special, and never
* returned from pkey_alloc().
* from pkey_alloc() or pkey 0 which is allocated
* implicitly when the mm is created.
*/
if (pkey <= 0)
if (pkey < 0)
return false;
if (pkey >= arch_max_pkey())
return false;
/*
* The exec-only pkey is set in the allocation map, but
* is not available to any of the user interfaces like
* mprotect_pkey().
*/
if (pkey == mm->context.execute_only_pkey)
return false;
return mm_pkey_allocation_map(mm) & (1U << pkey);
}

View file

@ -116,6 +116,7 @@ static void init_x2apic_ldr(void)
goto update;
}
cmsk = cluster_hotplug_mask;
cmsk->clusterid = cluster;
cluster_hotplug_mask = NULL;
update:
this_cpu_write(cluster_masks, cmsk);

View file

@ -104,6 +104,12 @@ static bool __head check_la57_support(unsigned long physaddr)
}
#endif
/* Code in __startup_64() can be relocated during execution, but the compiler
* doesn't have to generate PC-relative relocations when accessing globals from
* that function. Clang actually does not generate them, which leads to
* boot-time crashes. To work around this problem, every global pointer must
* be adjusted using fixup_pointer().
*/
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
@ -113,6 +119,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
p4dval_t *p4d;
pudval_t *pud;
pmdval_t *pmd, pmd_entry;
pteval_t *mask_ptr;
bool la57;
int i;
unsigned int *next_pgt_ptr;
@ -196,7 +203,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
/* Filter out unsupported __PAGE_KERNEL_* bits: */
pmd_entry &= __supported_pte_mask;
mask_ptr = fixup_pointer(&__supported_pte_mask, physaddr);
pmd_entry &= *mask_ptr;
pmd_entry += sme_get_me_mask();
pmd_entry += physaddr;

View file

@ -542,6 +542,7 @@ void set_personality_64bit(void)
clear_thread_flag(TIF_X32);
/* Pretend that this comes from a 64bit execve */
task_pt_regs(current)->orig_ax = __NR_execve;
current_thread_info()->status &= ~TS_COMPAT;
/* Ensure the corresponding mm is not marked. */
if (current->mm)

View file

@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey
*/
if (pkey != -1)
return pkey;
/*
* Look for a protection-key-drive execute-only mapping
* which is now being given permissions that are not
* execute-only. Move it back to the default pkey.
*/
if (vma_is_pkey_exec_only(vma) &&
(prot & (PROT_READ|PROT_WRITE))) {
return 0;
}
/*
* The mapping is execute-only. Go try to get the
* execute-only protection key. If we fail to do that,
* fall through as if we do not have execute-only
* support.
* support in this mm.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
} else if (vma_is_pkey_exec_only(vma)) {
/*
* Protections are *not* PROT_EXEC, but the mapping
* is using the exec-only pkey. This mapping was
* PROT_EXEC and will no longer be. Move back to
* the default pkey.
*/
return ARCH_DEFAULT_PKEY;
}
/*
* This is a vanilla, non-pkey mprotect (or we failed to
* setup execute-only), inherit the pkey from the VMA we

View file

@ -11,7 +11,7 @@ CAN_BUILD_X86_64 := $(shell ./check_cc.sh $(CC) trivial_64bit_program.c)
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
protection_keys test_vdso test_vsyscall
protection_keys test_vdso test_vsyscall mov_ss_trap
TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer

View file

@ -0,0 +1,285 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* mov_ss_trap.c: Exercise the bizarre side effects of a watchpoint on MOV SS
*
* This does MOV SS from a watchpointed address followed by various
* types of kernel entries. A MOV SS that hits a watchpoint will queue
* up a #DB trap but will not actually deliver that trap. The trap
* will be delivered after the next instruction instead. The CPU's logic
* seems to be:
*
* - Any fault: drop the pending #DB trap.
* - INT $N, INT3, INTO, SYSCALL, SYSENTER: enter the kernel and then
* deliver #DB.
* - ICEBP: enter the kernel but do not deliver the watchpoint trap
* - breakpoint: only one #DB is delivered (phew!)
*
* There are plenty of ways for a kernel to handle this incorrectly. This
* test tries to exercise all the cases.
*
* This should mostly cover CVE-2018-1087 and CVE-2018-8897.
*/
#define _GNU_SOURCE
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <err.h>
#include <string.h>
#include <setjmp.h>
#include <sys/prctl.h>
#define X86_EFLAGS_RF (1UL << 16)
#if __x86_64__
# define REG_IP REG_RIP
#else
# define REG_IP REG_EIP
#endif
unsigned short ss;
extern unsigned char breakpoint_insn[];
sigjmp_buf jmpbuf;
static unsigned char altstack_data[SIGSTKSZ];
static void enable_watchpoint(void)
{
pid_t parent = getpid();
int status;
pid_t child = fork();
if (child < 0)
err(1, "fork");
if (child) {
if (waitpid(child, &status, 0) != child)
err(1, "waitpid for child");
} else {
unsigned long dr0, dr1, dr7;
dr0 = (unsigned long)&ss;
dr1 = (unsigned long)breakpoint_insn;
dr7 = ((1UL << 1) | /* G0 */
(3UL << 16) | /* RW0 = read or write */
(1UL << 18) | /* LEN0 = 2 bytes */
(1UL << 3)); /* G1, RW1 = insn */
if (ptrace(PTRACE_ATTACH, parent, NULL, NULL) != 0)
err(1, "PTRACE_ATTACH");
if (waitpid(parent, &status, 0) != parent)
err(1, "waitpid for child");
if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[0]), dr0) != 0)
err(1, "PTRACE_POKEUSER DR0");
if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[1]), dr1) != 0)
err(1, "PTRACE_POKEUSER DR1");
if (ptrace(PTRACE_POKEUSER, parent, (void *)offsetof(struct user, u_debugreg[7]), dr7) != 0)
err(1, "PTRACE_POKEUSER DR7");
printf("\tDR0 = %lx, DR1 = %lx, DR7 = %lx\n", dr0, dr1, dr7);
if (ptrace(PTRACE_DETACH, parent, NULL, NULL) != 0)
err(1, "PTRACE_DETACH");
exit(0);
}
}
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO | flags;
sigemptyset(&sa.sa_mask);
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
}
static char const * const signames[] = {
[SIGSEGV] = "SIGSEGV",
[SIGBUS] = "SIBGUS",
[SIGTRAP] = "SIGTRAP",
[SIGILL] = "SIGILL",
};
static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
{
ucontext_t *ctx = ctx_void;
printf("\tGot SIGTRAP with RIP=%lx, EFLAGS.RF=%d\n",
(unsigned long)ctx->uc_mcontext.gregs[REG_IP],
!!(ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_RF));
}
static void handle_and_return(int sig, siginfo_t *si, void *ctx_void)
{
ucontext_t *ctx = ctx_void;
printf("\tGot %s with RIP=%lx\n", signames[sig],
(unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
}
static void handle_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
{
ucontext_t *ctx = ctx_void;
printf("\tGot %s with RIP=%lx\n", signames[sig],
(unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
siglongjmp(jmpbuf, 1);
}
int main()
{
unsigned long nr;
asm volatile ("mov %%ss, %[ss]" : [ss] "=m" (ss));
printf("\tSS = 0x%hx, &SS = 0x%p\n", ss, &ss);
if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0) == 0)
printf("\tPR_SET_PTRACER_ANY succeeded\n");
printf("\tSet up a watchpoint\n");
sethandler(SIGTRAP, sigtrap, 0);
enable_watchpoint();
printf("[RUN]\tRead from watched memory (should get SIGTRAP)\n");
asm volatile ("mov %[ss], %[tmp]" : [tmp] "=r" (nr) : [ss] "m" (ss));
printf("[RUN]\tMOV SS; INT3\n");
asm volatile ("mov %[ss], %%ss; int3" :: [ss] "m" (ss));
printf("[RUN]\tMOV SS; INT 3\n");
asm volatile ("mov %[ss], %%ss; .byte 0xcd, 0x3" :: [ss] "m" (ss));
printf("[RUN]\tMOV SS; CS CS INT3\n");
asm volatile ("mov %[ss], %%ss; .byte 0x2e, 0x2e; int3" :: [ss] "m" (ss));
printf("[RUN]\tMOV SS; CSx14 INT3\n");
asm volatile ("mov %[ss], %%ss; .fill 14,1,0x2e; int3" :: [ss] "m" (ss));
printf("[RUN]\tMOV SS; INT 4\n");
sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
asm volatile ("mov %[ss], %%ss; int $4" :: [ss] "m" (ss));
#ifdef __i386__
printf("[RUN]\tMOV SS; INTO\n");
sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
nr = -1;
asm volatile ("add $1, %[tmp]; mov %[ss], %%ss; into"
: [tmp] "+r" (nr) : [ss] "m" (ss));
#endif
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; ICEBP\n");
/* Some emulators (e.g. QEMU TCG) don't emulate ICEBP. */
sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
asm volatile ("mov %[ss], %%ss; .byte 0xf1" :: [ss] "m" (ss));
}
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; CLI\n");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
asm volatile ("mov %[ss], %%ss; cli" :: [ss] "m" (ss));
}
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; #PF\n");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
asm volatile ("mov %[ss], %%ss; mov (-1), %[tmp]"
: [tmp] "=r" (nr) : [ss] "m" (ss));
}
/*
* INT $1: if #DB has DPL=3 and there isn't special handling,
* then the kernel will die.
*/
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; INT 1\n");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
asm volatile ("mov %[ss], %%ss; int $1" :: [ss] "m" (ss));
}
#ifdef __x86_64__
/*
* In principle, we should test 32-bit SYSCALL as well, but
* the calling convention is so unpredictable that it's
* not obviously worth the effort.
*/
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; SYSCALL\n");
sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
nr = SYS_getpid;
/*
* Toggle the high bit of RSP to make it noncanonical to
* strengthen this test on non-SMAP systems.
*/
asm volatile ("btc $63, %%rsp\n\t"
"mov %[ss], %%ss; syscall\n\t"
"btc $63, %%rsp"
: "+a" (nr) : [ss] "m" (ss)
: "rcx"
#ifdef __x86_64__
, "r11"
#endif
);
}
#endif
printf("[RUN]\tMOV SS; breakpointed NOP\n");
asm volatile ("mov %[ss], %%ss; breakpoint_insn: nop" :: [ss] "m" (ss));
/*
* Invoking SYSENTER directly breaks all the rules. Just handle
* the SIGSEGV.
*/
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; SYSENTER\n");
stack_t stack = {
.ss_sp = altstack_data,
.ss_size = SIGSTKSZ,
};
if (sigaltstack(&stack, NULL) != 0)
err(1, "sigaltstack");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
nr = SYS_getpid;
asm volatile ("mov %[ss], %%ss; SYSENTER" : "+a" (nr)
: [ss] "m" (ss) : "flags", "rcx"
#ifdef __x86_64__
, "r11"
#endif
);
/* We're unreachable here. SYSENTER forgets RIP. */
}
if (sigsetjmp(jmpbuf, 1) == 0) {
printf("[RUN]\tMOV SS; INT $0x80\n");
sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
nr = 20; /* compat getpid */
asm volatile ("mov %[ss], %%ss; int $0x80"
: "+a" (nr) : [ss] "m" (ss)
: "flags"
#ifdef __x86_64__
, "r8", "r9", "r10", "r11"
#endif
);
}
printf("[OK]\tI aten't dead\n");
return 0;
}

View file

@ -368,6 +368,11 @@ static int expected_bnd_index = -1;
uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */
unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS];
/* Failed address bound checks: */
#ifndef SEGV_BNDERR
# define SEGV_BNDERR 3
#endif
/*
* The kernel is supposed to provide some information about the bounds
* exception in the siginfo. It should match what we have in the bounds
@ -419,8 +424,6 @@ void handler(int signum, siginfo_t *si, void *vucontext)
br_count++;
dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count);
#define SEGV_BNDERR 3 /* failed address bound checks */
dprintf2("Saw a #BR! status 0x%jx at %016lx br_reason: %jx\n",
status, ip, br_reason);
dprintf2("si_signo: %d\n", si->si_signo);

View file

@ -26,30 +26,26 @@ static inline void sigsafe_printf(const char *format, ...)
{
va_list ap;
va_start(ap, format);
if (!dprint_in_signal) {
va_start(ap, format);
vprintf(format, ap);
va_end(ap);
} else {
int ret;
int len = vsnprintf(dprint_in_signal_buffer,
DPRINT_IN_SIGNAL_BUF_SIZE,
format, ap);
/*
* len is amount that would have been printed,
* but actual write is truncated at BUF_SIZE.
* No printf() functions are signal-safe.
* They deadlock easily. Write the format
* string to get some output, even if
* incomplete.
*/
if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
len = DPRINT_IN_SIGNAL_BUF_SIZE;
ret = write(1, dprint_in_signal_buffer, len);
ret = write(1, format, strlen(format));
if (ret < 0)
abort();
exit(1);
}
va_end(ap);
}
#define dprintf_level(level, args...) do { \
if (level <= DEBUG_LEVEL) \
sigsafe_printf(args); \
fflush(NULL); \
} while (0)
#define dprintf0(args...) dprintf_level(0, args)
#define dprintf1(args...) dprintf_level(1, args)

View file

@ -72,10 +72,9 @@ extern void abort_hooks(void);
test_nr, iteration_nr); \
dprintf0("errno at assert: %d", errno); \
abort_hooks(); \
assert(condition); \
exit(__LINE__); \
} \
} while (0)
#define raw_assert(cond) assert(cond)
void cat_into_file(char *str, char *file)
{
@ -87,12 +86,17 @@ void cat_into_file(char *str, char *file)
* these need to be raw because they are called under
* pkey_assert()
*/
raw_assert(fd >= 0);
if (fd < 0) {
fprintf(stderr, "error opening '%s'\n", str);
perror("error: ");
exit(__LINE__);
}
ret = write(fd, str, strlen(str));
if (ret != strlen(str)) {
perror("write to file failed");
fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
raw_assert(0);
exit(__LINE__);
}
close(fd);
}
@ -191,26 +195,30 @@ void lots_o_noops_around_write(int *write_to_me)
#ifdef __i386__
#ifndef SYS_mprotect_key
# define SYS_mprotect_key 380
# define SYS_mprotect_key 380
#endif
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc 381
# define SYS_pkey_free 382
# define SYS_pkey_alloc 381
# define SYS_pkey_free 382
#endif
#define REG_IP_IDX REG_EIP
#define si_pkey_offset 0x14
#define REG_IP_IDX REG_EIP
#define si_pkey_offset 0x14
#else
#ifndef SYS_mprotect_key
# define SYS_mprotect_key 329
# define SYS_mprotect_key 329
#endif
#ifndef SYS_pkey_alloc
# define SYS_pkey_alloc 330
# define SYS_pkey_free 331
# define SYS_pkey_alloc 330
# define SYS_pkey_free 331
#endif
#define REG_IP_IDX REG_RIP
#define si_pkey_offset 0x20
#define REG_IP_IDX REG_RIP
#define si_pkey_offset 0x20
#endif
@ -225,8 +233,14 @@ void dump_mem(void *dumpme, int len_bytes)
}
}
#define SEGV_BNDERR 3 /* failed address bound checks */
#define SEGV_PKUERR 4
/* Failed address bound checks: */
#ifndef SEGV_BNDERR
# define SEGV_BNDERR 3
#endif
#ifndef SEGV_PKUERR
# define SEGV_PKUERR 4
#endif
static char *si_code_str(int si_code)
{
@ -289,13 +303,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
dump_mem(pkru_ptr - 128, 256);
pkey_assert(*pkru_ptr);
si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
dump_mem(si_pkey_ptr - 8, 24);
siginfo_pkey = *si_pkey_ptr;
pkey_assert(siginfo_pkey < NR_PKEYS);
last_si_pkey = siginfo_pkey;
if ((si->si_code == SEGV_MAPERR) ||
(si->si_code == SEGV_ACCERR) ||
(si->si_code == SEGV_BNDERR)) {
@ -303,6 +310,13 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
exit(4);
}
si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
dump_mem((u8 *)si_pkey_ptr - 8, 24);
siginfo_pkey = *si_pkey_ptr;
pkey_assert(siginfo_pkey < NR_PKEYS);
last_si_pkey = siginfo_pkey;
dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
/* need __rdpkru() version so we do not do shadow_pkru checking */
dprintf1("signal pkru from pkru: %08x\n", __rdpkru());
@ -311,22 +325,6 @@ void signal_handler(int signum, siginfo_t *si, void *vucontext)
dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
pkru_faults++;
dprintf1("<<<<==================================================\n");
return;
if (trapno == 14) {
fprintf(stderr,
"ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n",
trapno, ip);
fprintf(stderr, "si_addr %p\n", si->si_addr);
fprintf(stderr, "REG_ERR: %lx\n",
(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
exit(1);
} else {
fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
fprintf(stderr, "si_addr %p\n", si->si_addr);
fprintf(stderr, "REG_ERR: %lx\n",
(unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]);
exit(2);
}
dprint_in_signal = 0;
}
@ -393,10 +391,15 @@ pid_t fork_lazy_child(void)
return forkret;
}
#define PKEY_DISABLE_ACCESS 0x1
#define PKEY_DISABLE_WRITE 0x2
#ifndef PKEY_DISABLE_ACCESS
# define PKEY_DISABLE_ACCESS 0x1
#endif
u32 pkey_get(int pkey, unsigned long flags)
#ifndef PKEY_DISABLE_WRITE
# define PKEY_DISABLE_WRITE 0x2
#endif
static u32 hw_pkey_get(int pkey, unsigned long flags)
{
u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
u32 pkru = __rdpkru();
@ -418,7 +421,7 @@ u32 pkey_get(int pkey, unsigned long flags)
return masked_pkru;
}
int pkey_set(int pkey, unsigned long rights, unsigned long flags)
static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
{
u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
u32 old_pkru = __rdpkru();
@ -452,15 +455,15 @@ void pkey_disable_set(int pkey, int flags)
pkey, flags);
pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
pkey_rights = pkey_get(pkey, syscall_flags);
pkey_rights = hw_pkey_get(pkey, syscall_flags);
dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
pkey, pkey, pkey_rights);
pkey_assert(pkey_rights >= 0);
pkey_rights |= flags;
ret = pkey_set(pkey, pkey_rights, syscall_flags);
ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
assert(!ret);
/*pkru and flags have the same format */
shadow_pkru |= flags << (pkey * 2);
@ -468,8 +471,8 @@ void pkey_disable_set(int pkey, int flags)
pkey_assert(ret >= 0);
pkey_rights = pkey_get(pkey, syscall_flags);
dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
pkey_rights = hw_pkey_get(pkey, syscall_flags);
dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
pkey, pkey, pkey_rights);
dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
@ -483,24 +486,24 @@ void pkey_disable_clear(int pkey, int flags)
{
unsigned long syscall_flags = 0;
int ret;
int pkey_rights = pkey_get(pkey, syscall_flags);
int pkey_rights = hw_pkey_get(pkey, syscall_flags);
u32 orig_pkru = rdpkru();
pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
pkey, pkey, pkey_rights);
pkey_assert(pkey_rights >= 0);
pkey_rights |= flags;
ret = pkey_set(pkey, pkey_rights, 0);
ret = hw_pkey_set(pkey, pkey_rights, 0);
/* pkru and flags have the same format */
shadow_pkru &= ~(flags << (pkey * 2));
pkey_assert(ret >= 0);
pkey_rights = pkey_get(pkey, syscall_flags);
dprintf1("%s(%d) pkey_get(%d): %x\n", __func__,
pkey_rights = hw_pkey_get(pkey, syscall_flags);
dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
pkey, pkey, pkey_rights);
dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
@ -674,10 +677,12 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
struct pkey_malloc_record {
void *ptr;
long size;
int prot;
};
struct pkey_malloc_record *pkey_malloc_records;
struct pkey_malloc_record *pkey_last_malloc_record;
long nr_pkey_malloc_records;
void record_pkey_malloc(void *ptr, long size)
void record_pkey_malloc(void *ptr, long size, int prot)
{
long i;
struct pkey_malloc_record *rec = NULL;
@ -709,6 +714,8 @@ void record_pkey_malloc(void *ptr, long size)
(int)(rec - pkey_malloc_records), rec, ptr, size);
rec->ptr = ptr;
rec->size = size;
rec->prot = prot;
pkey_last_malloc_record = rec;
nr_pkey_malloc_records++;
}
@ -753,7 +760,7 @@ void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
pkey_assert(ptr != (void *)-1);
ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
pkey_assert(!ret);
record_pkey_malloc(ptr, size);
record_pkey_malloc(ptr, size, prot);
rdpkru();
dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
@ -774,7 +781,7 @@ void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
size = ALIGN_UP(size, HPAGE_SIZE * 2);
ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
pkey_assert(ptr != (void *)-1);
record_pkey_malloc(ptr, size);
record_pkey_malloc(ptr, size, prot);
mprotect_pkey(ptr, size, prot, pkey);
dprintf1("unaligned ptr: %p\n", ptr);
@ -847,7 +854,7 @@ void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
pkey_assert(ptr != (void *)-1);
mprotect_pkey(ptr, size, prot, pkey);
record_pkey_malloc(ptr, size);
record_pkey_malloc(ptr, size, prot);
dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
return ptr;
@ -869,7 +876,7 @@ void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
mprotect_pkey(ptr, size, prot, pkey);
record_pkey_malloc(ptr, size);
record_pkey_malloc(ptr, size, prot);
dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
close(fd);
@ -918,13 +925,21 @@ void *malloc_pkey(long size, int prot, u16 pkey)
}
int last_pkru_faults;
#define UNKNOWN_PKEY -2
void expected_pk_fault(int pkey)
{
dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
__func__, last_pkru_faults, pkru_faults);
dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
pkey_assert(last_pkru_faults + 1 == pkru_faults);
pkey_assert(last_si_pkey == pkey);
/*
* For exec-only memory, we do not know the pkey in
* advance, so skip this check.
*/
if (pkey != UNKNOWN_PKEY)
pkey_assert(last_si_pkey == pkey);
/*
* The signal handler shold have cleared out PKRU to let the
* test program continue. We now have to restore it.
@ -939,10 +954,11 @@ void expected_pk_fault(int pkey)
last_si_pkey = -1;
}
void do_not_expect_pk_fault(void)
{
pkey_assert(last_pkru_faults == pkru_faults);
}
#define do_not_expect_pk_fault(msg) do { \
if (last_pkru_faults != pkru_faults) \
dprintf0("unexpected PK fault: %s\n", msg); \
pkey_assert(last_pkru_faults == pkru_faults); \
} while (0)
int test_fds[10] = { -1 };
int nr_test_fds;
@ -1151,12 +1167,15 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
pkey_assert(i < NR_PKEYS*2);
/*
* There are 16 pkeys supported in hardware. One is taken
* up for the default (0) and another can be taken up by
* an execute-only mapping. Ensure that we can allocate
* at least 14 (16-2).
* There are 16 pkeys supported in hardware. Three are
* allocated by the time we get here:
* 1. The default key (0)
* 2. One possibly consumed by an execute-only mapping.
* 3. One allocated by the test code and passed in via
* 'pkey' to this function.
* Ensure that we can allocate at least another 13 (16-3).
*/
pkey_assert(i >= NR_PKEYS-2);
pkey_assert(i >= NR_PKEYS-3);
for (i = 0; i < nr_allocated_pkeys; i++) {
err = sys_pkey_free(allocated_pkeys[i]);
@ -1165,6 +1184,35 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
}
}
/*
* pkey 0 is special. It is allocated by default, so you do not
* have to call pkey_alloc() to use it first. Make sure that it
* is usable.
*/
void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
{
long size;
int prot;
assert(pkey_last_malloc_record);
size = pkey_last_malloc_record->size;
/*
* This is a bit of a hack. But mprotect() requires
* huge-page-aligned sizes when operating on hugetlbfs.
* So, make sure that we use something that's a multiple
* of a huge page when we can.
*/
if (size >= HPAGE_SIZE)
size = HPAGE_SIZE;
prot = pkey_last_malloc_record->prot;
/* Use pkey 0 */
mprotect_pkey(ptr, size, prot, 0);
/* Make sure that we can set it back to the original pkey. */
mprotect_pkey(ptr, size, prot, pkey);
}
void test_ptrace_of_child(int *ptr, u16 pkey)
{
__attribute__((__unused__)) int peek_result;
@ -1228,7 +1276,7 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
pkey_assert(ret != -1);
/* Now access from the current task, and expect NO exception: */
peek_result = read_ptr(plain_ptr);
do_not_expect_pk_fault();
do_not_expect_pk_fault("read plain pointer after ptrace");
ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
pkey_assert(ret != -1);
@ -1241,12 +1289,9 @@ void test_ptrace_of_child(int *ptr, u16 pkey)
free(plain_ptr_unaligned);
}
void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
void *get_pointer_to_instructions(void)
{
void *p1;
int scratch;
int ptr_contents;
int ret;
p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
@ -1256,7 +1301,23 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
/* Point 'p1' at the *second* page of the function: */
p1 += PAGE_SIZE;
/*
* Try to ensure we fault this in on next touch to ensure
* we get an instruction fault as opposed to a data one
*/
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
return p1;
}
void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
{
void *p1;
int scratch;
int ptr_contents;
int ret;
p1 = get_pointer_to_instructions();
lots_o_noops_around_write(&scratch);
ptr_contents = read_ptr(p1);
dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
@ -1272,12 +1333,55 @@ void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
*/
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write(&scratch);
do_not_expect_pk_fault();
do_not_expect_pk_fault("executing on PROT_EXEC memory");
ptr_contents = read_ptr(p1);
dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
expected_pk_fault(pkey);
}
void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
{
void *p1;
int scratch;
int ptr_contents;
int ret;
dprintf1("%s() start\n", __func__);
p1 = get_pointer_to_instructions();
lots_o_noops_around_write(&scratch);
ptr_contents = read_ptr(p1);
dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
/* Use a *normal* mprotect(), not mprotect_pkey(): */
ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
pkey_assert(!ret);
dprintf2("pkru: %x\n", rdpkru());
/* Make sure this is an *instruction* fault */
madvise(p1, PAGE_SIZE, MADV_DONTNEED);
lots_o_noops_around_write(&scratch);
do_not_expect_pk_fault("executing on PROT_EXEC memory");
ptr_contents = read_ptr(p1);
dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
expected_pk_fault(UNKNOWN_PKEY);
/*
* Put the memory back to non-PROT_EXEC. Should clear the
* exec-only pkey off the VMA and allow it to be readable
* again. Go to PROT_NONE first to check for a kernel bug
* that did not clear the pkey when doing PROT_NONE.
*/
ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
pkey_assert(!ret);
ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
pkey_assert(!ret);
ptr_contents = read_ptr(p1);
do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
}
void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
{
int size = PAGE_SIZE;
@ -1302,6 +1406,8 @@ void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_kernel_gup_of_access_disabled_region,
test_kernel_gup_write_to_write_disabled_region,
test_executing_on_unreadable_memory,
test_implicit_mprotect_exec_only_memory,
test_mprotect_with_pkey_0,
test_ptrace_of_child,
test_pkey_syscalls_on_non_allocated_pkey,
test_pkey_syscalls_bad_args,