diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b9b0623be925..5c7a0f5b0a2f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5100,13 +5100,12 @@
 			targets for exploits that can control RIP.
 
 			emulate     [default] Vsyscalls turn into traps and are
-			            emulated reasonably safely.
+			            emulated reasonably safely.  The vsyscall
+			            page is readable.
 
-			native      Vsyscalls are native syscall instructions.
-			            This is a little bit faster than trapping
-			            and makes a few dynamic recompilers work
-			            better than they would in emulation mode.
-			            It also makes exploits much easier to write.
+			xonly       Vsyscalls turn into traps and are
+			            emulated reasonably safely.  The vsyscall
+			            page is not readable.
 
 			none        Vsyscalls don't work at all.  This makes
 			            them quite hard to use for exploits but
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51a98d6eae8e..fbabf59692ff 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2288,7 +2288,7 @@ config COMPAT_VDSO
 choice
 	prompt "vsyscall table for legacy applications"
 	depends on X86_64
-	default LEGACY_VSYSCALL_EMULATE
+	default LEGACY_VSYSCALL_XONLY
 	help
 	  Legacy user code that does not know how to find the vDSO expects
 	  to be able to issue three syscalls by calling fixed addresses in
@@ -2296,23 +2296,38 @@ choice
 	  it can be used to assist security vulnerability exploitation.
 
 	  This setting can be changed at boot time via the kernel command
-	  line parameter vsyscall=[emulate|none].
+	  line parameter vsyscall=[emulate|xonly|none].
 
 	  On a system with recent enough glibc (2.14 or newer) and no
 	  static binaries, you can say None without a performance penalty
 	  to improve security.
 
-	  If unsure, select "Emulate".
+	  If unsure, select "Emulate execution only".
 
 config LEGACY_VSYSCALL_EMULATE
-	bool "Emulate"
+	bool "Full emulation"
 	help
-	  The kernel traps and emulates calls into the fixed
-	  vsyscall address mapping. This makes the mapping
-	  non-executable, but it still contains known contents,
-	  which could be used in certain rare security vulnerability
-	  exploits. This configuration is recommended when userspace
-	  still uses the vsyscall area.
+	  The kernel traps and emulates calls into the fixed vsyscall
+	  address mapping. This makes the mapping non-executable, but
+	  it still contains readable known contents, which could be
+	  used in certain rare security vulnerability exploits. This
+	  configuration is recommended when using legacy userspace
+	  that still uses vsyscalls along with legacy binary
+	  instrumentation tools that require code to be readable.
+
+	  An example of this type of legacy userspace is running
+	  Pin on an old binary that still uses vsyscalls.
+
+config LEGACY_VSYSCALL_XONLY
+	bool "Emulate execution only"
+	help
+	  The kernel traps and emulates calls into the fixed vsyscall
+	  address mapping and does not allow reads.  This
+	  configuration is recommended when userspace might use the
+	  legacy vsyscall area but support for legacy binary
+	  instrumentation of legacy code is not needed.  It mitigates
+	  certain uses of the vsyscall area as an ASLR-bypassing
+	  buffer.
 
 config LEGACY_VSYSCALL_NONE
 	bool "None"
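For readers who have not met the legacy ABI that the help text above describes: old static binaries call three fixed addresses directly instead of going through the vDSO. Below is a minimal, illustrative user program (not part of the patch; the constant mirrors the one used by the selftest further down). Under emulate and xonly the call traps into the kernel and is emulated; under vsyscall=none it raises SIGSEGV.

/* Illustrative only: call the legacy vsyscall entry for time() directly. */
#include <stdio.h>
#include <time.h>

typedef long (*time_func_t)(time_t *t);
static const time_func_t vsyscall_time = (time_func_t)0xffffffffff600400UL;

int main(void)
{
	time_t t = vsyscall_time(NULL);

	printf("time() via the fixed vsyscall address returned %ld\n", (long)t);
	return 0;
}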
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index d9d81ad7a400..07003f3f1bfc 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -42,9 +42,11 @@
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
 
-static enum { EMULATE, NONE } vsyscall_mode =
+static enum { EMULATE, XONLY, NONE } vsyscall_mode __ro_after_init =
 #ifdef CONFIG_LEGACY_VSYSCALL_NONE
 	NONE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_XONLY)
+	XONLY;
 #else
 	EMULATE;
 #endif
@@ -54,6 +56,8 @@ static int __init vsyscall_setup(char *str)
 	if (str) {
 		if (!strcmp("emulate", str))
 			vsyscall_mode = EMULATE;
+		else if (!strcmp("xonly", str))
+			vsyscall_mode = XONLY;
 		else if (!strcmp("none", str))
 			vsyscall_mode = NONE;
 		else
@@ -113,7 +117,8 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size)
 	}
 }
 
-bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+bool emulate_vsyscall(unsigned long error_code,
+		      struct pt_regs *regs, unsigned long address)
 {
 	struct task_struct *tsk;
 	unsigned long caller;
@@ -122,6 +127,22 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 	long ret;
 	unsigned long orig_dx;
 
+	/* Write faults or kernel-privilege faults never get fixed up. */
+	if ((error_code & (X86_PF_WRITE | X86_PF_USER)) != X86_PF_USER)
+		return false;
+
+	if (!(error_code & X86_PF_INSTR)) {
+		/* Failed vsyscall read */
+		if (vsyscall_mode == EMULATE)
+			return false;
+
+		/*
+		 * User code tried and failed to read the vsyscall page.
+		 */
+		warn_bad_vsyscall(KERN_INFO, regs, "vsyscall read attempt denied -- look up the vsyscall kernel parameter if you need a workaround");
+		return false;
+	}
+
 	/*
 	 * No point in checking CS -- the only way to get here is a user mode
 	 * trap to a high address, which means that we're in 64-bit user code.
@@ -284,7 +305,7 @@ static const char *gate_vma_name(struct vm_area_struct *vma)
 static const struct vm_operations_struct gate_vma_ops = {
 	.name = gate_vma_name,
 };
-static struct vm_area_struct gate_vma = {
+static struct vm_area_struct gate_vma __ro_after_init = {
 	.vm_start	= VSYSCALL_ADDR,
 	.vm_end		= VSYSCALL_ADDR + PAGE_SIZE,
 	.vm_page_prot	= PAGE_READONLY_EXEC,
@@ -357,12 +378,20 @@ void __init map_vsyscall(void)
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 
-	if (vsyscall_mode != NONE) {
+	/*
+	 * For full emulation, the page needs to exist for real.  In
+	 * execute-only mode, there is no PTE at all backing the vsyscall
+	 * page.
+	 */
+	if (vsyscall_mode == EMULATE) {
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 			     PAGE_KERNEL_VVAR);
 		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
 	}
 
+	if (vsyscall_mode == XONLY)
+		gate_vma.vm_flags = VM_EXEC;
+
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 		     (unsigned long)VSYSCALL_ADDR);
 }
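The user-visible effect of the map_vsyscall() and emulate_vsyscall() hunks above: under xonly (and none) no PTE backs the vsyscall page, so an ordinary data load faults, while under emulate the page remains mapped read-only. A rough probe, not part of the patch, that shows which case applies on the running kernel:

/* Illustrative only: check whether the vsyscall page can be read directly. */
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

static void on_segv(int sig)
{
	siglongjmp(env, 1);
}

int main(void)
{
	signal(SIGSEGV, on_segv);

	if (sigsetjmp(env, 1) == 0) {
		unsigned char c = *(volatile unsigned char *)0xffffffffff600000UL;

		printf("vsyscall page is readable (emulate); first byte 0x%02x\n", c);
	} else {
		printf("vsyscall page is not readable (xonly or none)\n");
	}
	return 0;
}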
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index b986b2ca688a..ab60a71a8dcb 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -13,10 +13,12 @@ extern void set_vsyscall_pgtable_user_bits(pgd_t *root);
  * Called on instruction fetch fault in vsyscall page.
  * Returns true if handled.
  */
-extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
+extern bool emulate_vsyscall(unsigned long error_code,
+			     struct pt_regs *regs, unsigned long address);
 #else
 static inline void map_vsyscall(void) {}
-static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
+static inline bool emulate_vsyscall(unsigned long error_code,
+				    struct pt_regs *regs, unsigned long address)
 {
 	return false;
 }
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 46df4c6aae46..58e4f1f00bbc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -710,6 +710,10 @@ static void set_signal_archinfo(unsigned long address,
 	 * To avoid leaking information about the kernel page
 	 * table layout, pretend that user-mode accesses to
 	 * kernel addresses are always protection faults.
+	 *
+	 * NB: This means that failed vsyscalls with vsyscall=none
+	 * will have the PROT bit.  This doesn't leak any
+	 * information and does not appear to cause any problems.
 	 */
 	if (address >= TASK_SIZE_MAX)
 		error_code |= X86_PF_PROT;
@@ -1369,16 +1373,18 @@ void do_user_addr_fault(struct pt_regs *regs,
 
 #ifdef CONFIG_X86_64
 	/*
-	 * Instruction fetch faults in the vsyscall page might need
-	 * emulation.  The vsyscall page is at a high address
-	 * (>PAGE_OFFSET), but is considered to be part of the user
-	 * address space.
+	 * Faults in the vsyscall page might need emulation.  The
+	 * vsyscall page is at a high address (>PAGE_OFFSET), but is
+	 * considered to be part of the user address space.
 	 *
 	 * The vsyscall page does not have a "real" VMA, so do this
 	 * emulation before we go searching for VMAs.
+	 *
+	 * PKRU never rejects instruction fetches, so we don't need
+	 * to consider the PF_PK bit.
 	 */
-	if ((hw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
-		if (emulate_vsyscall(regs, address))
+	if (is_vsyscall_vaddr(address)) {
+		if (emulate_vsyscall(hw_error_code, regs, address))
 			return;
 	}
 #endif
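do_user_addr_fault() now hands the hardware error code to emulate_vsyscall(), and the same error code reaches user space in the signal frame; the selftest below records it via REG_ERR. A small, illustrative decoder (not part of the patch) using the standard x86 #PF error-code bit layout (PROT=bit 0, WRITE=bit 1, USER=bit 2, INSTR=bit 4):

/* Illustrative only: print the #PF error code for a faulting vsyscall read. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ucontext.h>

static void segv_handler(int sig, siginfo_t *info, void *ctx_void)
{
	ucontext_t *ctx = (ucontext_t *)ctx_void;
	unsigned long err = ctx->uc_mcontext.gregs[REG_ERR];

	printf("#PF(0x%lx):%s%s%s%s\n", err,
	       (err & (1UL << 0)) ? " PROT" : "",
	       (err & (1UL << 1)) ? " WRITE" : "",
	       (err & (1UL << 2)) ? " USER" : "",
	       (err & (1UL << 4)) ? " INSTR" : "");
	exit(0);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = segv_handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	/* Faults under xonly/none; under emulate the read simply succeeds. */
	return *(volatile int *)0xffffffffff600000UL;
}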
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
index 0b4f1cc2291c..4602326b8f5b 100644
--- a/tools/testing/selftests/x86/test_vsyscall.c
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -18,6 +18,7 @@
 #include <sched.h>
 #include <stdbool.h>
 #include <setjmp.h>
+#include <sys/uio.h>
 
 #ifdef __x86_64__
 # define VSYS(x) (x)
@@ -49,21 +50,21 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
 }
 
 /* vsyscalls and vDSO */
-bool should_read_vsyscall = false;
+bool vsyscall_map_r = false, vsyscall_map_x = false;
 
 typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
-gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
 gtod_t vdso_gtod;
 
 typedef int (*vgettime_t)(clockid_t, struct timespec *);
 vgettime_t vdso_gettime;
 
 typedef long (*time_func_t)(time_t *t);
-time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
 time_func_t vdso_time;
 
 typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
-getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
 getcpu_t vdso_getcpu;
 
 static void init_vdso(void)
@@ -107,7 +108,7 @@ static int init_vsys(void)
 	maps = fopen("/proc/self/maps", "r");
 	if (!maps) {
 		printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
-		should_read_vsyscall = true;
+		vsyscall_map_r = true;
 		return 0;
 	}
 
@@ -133,12 +134,8 @@
 		}
 
 		printf("\tvsyscall permissions are %c-%c\n", r, x);
-		should_read_vsyscall = (r == 'r');
-		if (x != 'x') {
-			vgtod = NULL;
-			vtime = NULL;
-			vgetcpu = NULL;
-		}
+		vsyscall_map_r = (r == 'r');
+		vsyscall_map_x = (x == 'x');
 
 		found = true;
 		break;
@@ -148,10 +145,8 @@
 
 	if (!found) {
 		printf("\tno vsyscall map in /proc/self/maps\n");
-		should_read_vsyscall = false;
-		vgtod = NULL;
-		vtime = NULL;
-		vgetcpu = NULL;
+		vsyscall_map_r = false;
+		vsyscall_map_x = false;
 	}
 
 	return nerrs;
@@ -183,9 +178,13 @@ static inline long sys_getcpu(unsigned * cpu, unsigned * node,
 }
 
 static jmp_buf jmpbuf;
+static volatile unsigned long segv_err;
 
 static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
 {
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+
+	segv_err = ctx->uc_mcontext.gregs[REG_ERR];
 	siglongjmp(jmpbuf, 1);
 }
 
@@ -238,7 +237,7 @@
 		err(1, "syscall gettimeofday");
 	if (vdso_gtod)
 		ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
-	if (vgtod)
+	if (vsyscall_map_x)
 		ret_vsys = vgtod(&tv_vsys, &tz_vsys);
 	if (sys_gtod(&tv_sys2, &tz_sys) != 0)
 		err(1, "syscall gettimeofday");
@@ -252,7 +251,7 @@
 		}
 	}
 
-	if (vgtod) {
+	if (vsyscall_map_x) {
 		if (ret_vsys == 0) {
 			nerrs += check_gtod(&tv_sys1, &tv_sys2, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
 		} else {
@@ -273,7 +272,7 @@ static int test_time(void) {
 	t_sys1 = sys_time(&t2_sys1);
 	if (vdso_time)
 		t_vdso = vdso_time(&t2_vdso);
-	if (vtime)
+	if (vsyscall_map_x)
 		t_vsys = vtime(&t2_vsys);
 	t_sys2 = sys_time(&t2_sys2);
 	if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
@@ -294,7 +293,7 @@
 		}
 	}
 
-	if (vtime) {
+	if (vsyscall_map_x) {
 		if (t_vsys < 0 || t_vsys != t2_vsys) {
 			printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
 			nerrs++;
@@ -330,7 +329,7 @@
 	ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
 	if (vdso_getcpu)
 		ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
-	if (vgetcpu)
+	if (vsyscall_map_x)
 		ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
 
 	if (ret_sys == 0) {
@@ -369,7 +368,7 @@
 		}
 	}
 
-	if (vgetcpu) {
+	if (vsyscall_map_x) {
 		if (ret_vsys) {
 			printf("[FAIL]\tvsyscall getcpu() failed\n");
 			nerrs++;
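init_vsys() above derives vsyscall_map_r and vsyscall_map_x from the [vsyscall] line in /proc/self/maps. A condensed, illustrative version of that probe (not part of the patch): with this series applied, r-x should correspond to emulate, --x to xonly, and a missing line to vsyscall=none.

/* Illustrative only: report the [vsyscall] mapping permissions, if any. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	FILE *maps = fopen("/proc/self/maps", "r");
	char line[256];

	if (!maps)
		return 1;

	while (fgets(line, sizeof(line), maps)) {
		if (strstr(line, "[vsyscall]")) {
			printf("%s", line);
			fclose(maps);
			return 0;
		}
	}

	printf("no [vsyscall] mapping\n");
	fclose(maps);
	return 0;
}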
@@ -410,20 +409,88 @@
 		can_read = false;
 	}
 
-	if (can_read && !should_read_vsyscall) {
+	if (can_read && !vsyscall_map_r) {
 		printf("[FAIL]\tWe have read access, but we shouldn't\n");
 		return 1;
-	} else if (!can_read && should_read_vsyscall) {
+	} else if (!can_read && vsyscall_map_r) {
 		printf("[FAIL]\tWe don't have read access, but we should\n");
 		return 1;
+	} else if (can_read) {
+		printf("[OK]\tWe have read access\n");
 	} else {
-		printf("[OK]\tgot expected result\n");
+		printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
+		       segv_err);
 	}
 #endif
 
 	return 0;
 }
 
+static int test_vsys_x(void)
+{
+#ifdef __x86_64__
+	if (vsyscall_map_x) {
+		/* We already tested this adequately. */
+		return 0;
+	}
+
+	printf("[RUN]\tMake sure that vsyscalls really page fault\n");
+
+	bool can_exec;
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		vgtod(NULL, NULL);
+		can_exec = true;
+	} else {
+		can_exec = false;
+	}
+
+	if (can_exec) {
+		printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
+		return 1;
+	} else if (segv_err & (1 << 4)) { /* INSTR */
+		printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
+		       segv_err);
+	} else {
+		printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n",
+		       segv_err);
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+static int test_process_vm_readv(void)
+{
+#ifdef __x86_64__
+	char buf[4096];
+	struct iovec local, remote;
+	int ret;
+
+	printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
+
+	local.iov_base = buf;
+	local.iov_len = 4096;
+	remote.iov_base = (void *)0xffffffffff600000;
+	remote.iov_len = 4096;
+	ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
+	if (ret != 4096) {
+		printf("[OK]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", ret, errno);
+		return 0;
+	}
+
+	if (vsyscall_map_r) {
+		if (!memcmp(buf, (const void *)0xffffffffff600000, 4096)) {
+			printf("[OK]\tIt worked and read correct data\n");
+		} else {
+			printf("[FAIL]\tIt worked but returned incorrect data\n");
+			return 1;
+		}
+	}
+#endif
+
+	return 0;
+}
 
 #ifdef __x86_64__
 #define X86_EFLAGS_TF (1UL << 8)
@@ -455,7 +522,7 @@
 	time_t tmp;
 	bool is_native;
 
-	if (!vtime)
+	if (!vsyscall_map_x)
 		return 0;
 
 	printf("[RUN]\tchecking that vsyscalls are emulated\n");
@@ -497,6 +564,9 @@ int main(int argc, char **argv)
 	sethandler(SIGSEGV, sigsegv, 0);
 
 	nerrs += test_vsys_r();
+	nerrs += test_vsys_x();
+
+	nerrs += test_process_vm_readv();
 
 #ifdef __x86_64__
 	nerrs += test_emulation();
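The new test_process_vm_readv() above exercises the kernel's remote-access path rather than a direct load, and accepts either outcome of the read. A standalone, illustrative equivalent (not part of the patch) that can be run on its own to see how the running kernel behaves:

/* Illustrative only: read the vsyscall page via process_vm_readv(). */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct iovec local = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote = {
		.iov_base = (void *)0xffffffffff600000UL,
		.iov_len = sizeof(buf),
	};
	ssize_t ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);

	if (ret == (ssize_t)sizeof(buf))
		printf("read %zd bytes from the vsyscall page\n", ret);
	else
		printf("process_vm_readv() failed: ret=%zd errno=%d (%s)\n",
		       ret, errno, strerror(errno));
	return 0;
}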