diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 107acc403b3b..4f30ad647ec5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1519,6 +1519,7 @@ config AMD_MEM_ENCRYPT
 	select ARCH_USE_MEMREMAP_PROT
 	select ARCH_HAS_FORCE_DMA_UNENCRYPTED
 	select INSTRUCTION_DECODER
+	select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
 	help
 	  Say yes to enable support for the encryption of system memory.
 	  This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e94874f4bbc1..91ea0d550ada 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include <asm/trapnr.h>
 #include "pgtable.h"
 
 /*
@@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32)
 	movl	%eax, %gs
 	movl	%eax, %ss
 
-/* setup a stack and make sure cpu supports long mode. */
+	/* Setup a stack and load CS from current GDT */
 	leal	rva(boot_stack_end)(%ebp), %esp
 
+	pushl	$__KERNEL32_CS
+	leal	rva(1f)(%ebp), %eax
+	pushl	%eax
+	lretl
+1:
+
+	/* Setup Exception handling for SEV-ES */
+	call	startup32_load_idt
+
+	/* Make sure cpu supports long mode. */
 	call	verify_cpu
 	testl	%eax, %eax
 	jnz	.Lno_longmode
@@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32)
 	 */
 	call	get_sev_encryption_bit
 	xorl	%edx, %edx
+#ifdef	CONFIG_AMD_MEM_ENCRYPT
 	testl	%eax, %eax
 	jz	1f
 	subl	$32, %eax	/* Encryption bit is always above bit 31 */
 	bts	%eax, %edx	/* Set encryption mask for page tables */
+	/*
+	 * Mark SEV as active in sev_status so that startup32_check_sev_cbit()
+	 * will do a check. The sev_status memory will be fully initialized
+	 * with the contents of MSR_AMD_SEV_STATUS later in
+	 * set_sev_encryption_mask(). For now it is sufficient to know that SEV
+	 * is active.
+	 */
+	movl	$1, rva(sev_status)(%ebp)
 1:
+#endif
 
 	/* Initialize Page tables to 0 */
 	leal	rva(pgtable)(%ebx), %edi
@@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32)
 	movl	%esi, %edx
 1:
 #endif
+	/* Check if the C-bit position is correct when SEV is active */
+	call	startup32_check_sev_cbit
+
 	pushl	$__KERNEL_CS
 	pushl	%eax
 
@@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt)
 	.endr
 SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA_START(boot32_idt_desc)
+	.word	boot32_idt_end - boot32_idt - 1
+	.long	0
+SYM_DATA_END(boot32_idt_desc)
+	.balign 8
+SYM_DATA_START(boot32_idt)
+	.rept	32
+	.quad	0
+	.endr
+SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
+#endif
+
 #ifdef CONFIG_EFI_STUB
 SYM_DATA(image_offset, .long 0)
 #endif
@@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto)
 SYM_DATA_END(loaded_image_proto)
 #endif
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	__HEAD
+	.code32
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax:	Handler address
+ * %edx:	Vector number
+ *
+ * Physical offset is expected in %ebp
+ */
+SYM_FUNC_START(startup32_set_idt_entry)
+	push	%ebx
+	push	%ecx
+
+	/* IDT entry address to %ebx */
+	leal	rva(boot32_idt)(%ebp), %ebx
+	shl	$3, %edx
+	addl	%edx, %ebx
+
+	/* Build IDT entry, lower 4 bytes */
+	movl	%eax, %edx
+	andl	$0x0000ffff, %edx	# Target code segment offset [15:0]
+	movl	$__KERNEL32_CS, %ecx	# Target code segment selector
+	shl	$16, %ecx
+	orl	%ecx, %edx
+
+	/* Store lower 4 bytes to IDT */
+	movl	%edx, (%ebx)
+
+	/* Build IDT entry, upper 4 bytes */
+	movl	%eax, %edx
+	andl	$0xffff0000, %edx	# Target code segment offset [31:16]
+	orl	$0x00008e00, %edx	# Present, Type 32-bit Interrupt Gate
+
+	/* Store upper 4 bytes to IDT */
+	movl	%edx, 4(%ebx)
+
+	pop	%ecx
+	pop	%ebx
+
+	ret
+SYM_FUNC_END(startup32_set_idt_entry)
+#endif
+
+SYM_FUNC_START(startup32_load_idt)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	/* #VC handler */
+	leal	rva(startup32_vc_handler)(%ebp), %eax
+	movl	$X86_TRAP_VC, %edx
+	call	startup32_set_idt_entry
+
+	/* Load IDT */
+	leal	rva(boot32_idt)(%ebp), %eax
+	movl	%eax, rva(boot32_idt_desc+2)(%ebp)
+	lidt	rva(boot32_idt_desc)(%ebp)
+#endif
+	ret
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+
+	/* Check for non-zero sev_status */
+	movl	rva(sev_status)(%ebp), %eax
+	testl	%eax, %eax
+	jz	4f
+
+	/*
+	 * Get two 32-bit random values - Don't bail out if RDRAND fails
+	 * because it is better to prevent forward progress if no random value
+	 * can be gathered.
+	 */
+1:	rdrand	%eax
+	jnc	1b
+2:	rdrand	%ebx
+	jnc	2b
+
+	/* Store to memory and keep it in the registers */
+	movl	%eax, rva(sev_check_data)(%ebp)
+	movl	%ebx, rva(sev_check_data+4)(%ebp)
+
+	/* Enable paging to see if encryption is active */
+	movl	%cr0, %edx			 /* Backup %cr0 in %edx */
+	movl	$(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+	movl	%ecx, %cr0
+
+	cmpl	%eax, rva(sev_check_data)(%ebp)
+	jne	3f
+	cmpl	%ebx, rva(sev_check_data+4)(%ebp)
+	jne	3f
+
+	movl	%edx, %cr0	/* Restore previous %cr0 */
+
+	jmp	4f
+
+3:	/* Check failed - hlt the machine */
+	hlt
+	jmp	3b
+
+4:
+	popl	%edx
+	popl	%ecx
+	popl	%ebx
+	popl	%eax
+#endif
+	ret
+SYM_FUNC_END(startup32_check_sev_cbit)
+
 /*
  * Stack and heap for uncompression
  */
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 804a502ee0d2..9b93567d663a 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -52,3 +52,17 @@ void load_stage2_idt(void)
 
 	load_boot_idt(&boot_idt_desc);
 }
+
+void cleanup_exception_handling(void)
+{
+	/*
+	 * Flush GHCB from cache and map it encrypted again when running as
+	 * SEV-ES guest.
+	 */
+	sev_es_shutdown_ghcb();
+
+	/* Set a null-idt, disabling #PF and #VC handling */
+	boot_idt_desc.size    = 0;
+	boot_idt_desc.address = 0;
+	load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index aa561795efd1..c1e81a848b2a 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit)
 	push	%ecx
 	push	%edx
 
-	/* Check if running under a hypervisor */
-	movl	$1, %eax
-	cpuid
-	bt	$31, %ecx		/* Check the hypervisor bit */
-	jnc	.Lno_sev
-
 	movl	$0x80000000, %eax	/* CPUID to check the highest leaf */
 	cpuid
 	cmpl	$0x8000001f, %eax	/* See if 0x8000001f is available */
@@ -67,10 +61,132 @@
 	ret
 SYM_FUNC_END(get_sev_encryption_bit)
 
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ *		      the GHCB MSR protocol
+ *
+ * @%eax:	Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx:	CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+	shll	$30, %eax
+	orl	$0x00000004, %eax
+	movl	$MSR_AMD64_SEV_ES_GHCB, %ecx
+	wrmsr
+	rep; vmmcall		# VMGEXIT
+	rdmsr
+
+	/* Check response */
+	movl	%eax, %ecx
+	andl	$0x3ffff000, %ecx	# Bits [12-29] MBZ
+	jnz	2f
+
+	/* Check return code */
+	andl	$0xfff, %eax
+	cmpl	$5, %eax
+	jne	2f
+
+	/* All good - return success */
+	xorl	%eax, %eax
+1:
+	ret
+2:
+	movl	$-1, %eax
+	jmp	1b
+SYM_CODE_END(sev_es_req_cpuid)
+
+SYM_CODE_START(startup32_vc_handler)
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+
+	/* Keep CPUID function in %ebx */
+	movl	%eax, %ebx
+
+	/* Check if error-code == SVM_EXIT_CPUID */
+	cmpl	$0x72, 16(%esp)
+	jne	.Lfail
+
+	movl	$0, %eax		# Request CPUID[fn].EAX
+	movl	%ebx, %edx		# CPUID fn
+	call	sev_es_req_cpuid	# Call helper
+	testl	%eax, %eax		# Check return code
+	jnz	.Lfail
+	movl	%edx, 12(%esp)		# Store result
+
+	movl	$1, %eax		# Request CPUID[fn].EBX
+	movl	%ebx, %edx		# CPUID fn
+	call	sev_es_req_cpuid	# Call helper
+	testl	%eax, %eax		# Check return code
+	jnz	.Lfail
+	movl	%edx, 8(%esp)		# Store result
+
+	movl	$2, %eax		# Request CPUID[fn].ECX
+	movl	%ebx, %edx		# CPUID fn
+	call	sev_es_req_cpuid	# Call helper
+	testl	%eax, %eax		# Check return code
+	jnz	.Lfail
+	movl	%edx, 4(%esp)		# Store result
+
+	movl	$3, %eax		# Request CPUID[fn].EDX
+	movl	%ebx, %edx		# CPUID fn
+	call	sev_es_req_cpuid	# Call helper
+	testl	%eax, %eax		# Check return code
+	jnz	.Lfail
+	movl	%edx, 0(%esp)		# Store result
+
+	/*
+	 * Sanity check CPUID results from the Hypervisor. See comment in
+	 * do_vc_no_ghcb() for more details on why this is necessary.
+	 */
+
+	/* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+	cmpl	$0x80000000, %ebx
+	jne	.Lcheck_sev
+	cmpl	$0x8000001f, 12(%esp)
+	jb	.Lfail
+	jmp	.Ldone
+
+.Lcheck_sev:
+	/* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+	cmpl	$0x8000001f, %ebx
+	jne	.Ldone
+	btl	$1, 12(%esp)
+	jnc	.Lfail
+
+.Ldone:
+	popl	%edx
+	popl	%ecx
+	popl	%ebx
+	popl	%eax
+
+	/* Remove error code */
+	addl	$4, %esp
+
+	/* Jump over CPUID instruction */
+	addl	$2, (%esp)
+
+	iret
+.Lfail:
+	/* Send terminate request to Hypervisor */
+	movl	$0x100, %eax
+	xorl	%edx, %edx
+	movl	$MSR_AMD64_SEV_ES_GHCB, %ecx
+	wrmsr
+	rep; vmmcall
+
+	/* If request fails, go to hlt loop */
+	hlt
+	jmp	.Lfail
+SYM_CODE_END(startup32_vc_handler)
+
 	.code64
 
 #include "../../kernel/sev_verify_cbit.S"
-
 SYM_FUNC_START(set_sev_encryption_mask)
 #ifdef CONFIG_AMD_MEM_ENCRYPT
 	push	%rbp
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 267e7f93050e..cc9fd0e8766a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -443,11 +443,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 		handle_relocations(output, output_len, virt_addr);
 	debug_putstr("done.\nBooting the kernel.\n");
 
-	/*
-	 * Flush GHCB from cache and map it encrypted again when running as
-	 * SEV-ES guest.
-	 */
-	sev_es_shutdown_ghcb();
+	/* Disable exception handling before booting the kernel */
+	cleanup_exception_handling();
 
 	return output;
 }
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 901ea5ebec22..e5612f035498 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask;
 extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
 extern struct desc_ptr boot_idt_desc;
 
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
+#endif
+
 /* IDT Entry Points */
 void boot_page_fault(void);
 void boot_stage1_vc(void);
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
index 27826c265aab..d904bd56b3e3 100644
--- a/arch/x86/boot/compressed/sev-es.c
+++ b/arch/x86/boot/compressed/sev-es.c
@@ -200,14 +200,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
 	}
 
 finish:
-	if (result == ES_OK) {
+	if (result == ES_OK)
 		vc_finish_insn(&ctxt);
-	} else if (result != ES_RETRY) {
-		/*
-		 * For now, just halt the machine. That makes debugging easier,
-		 * later we just call sev_es_terminate() here.
-		 */
-		while (true)
-			asm volatile("hlt\n");
-	}
+	else if (result != ES_RETRY)
+		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
 }
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index cdc04d091242..0aa9f13efd57 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void)
 	return true;
 }
 
-static void sev_es_terminate(unsigned int reason)
+static void __noreturn sev_es_terminate(unsigned int reason)
 {
 	u64 val = GHCB_SEV_TERMINATE;
 
@@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	 * make it accessible to the hypervisor.
 	 *
 	 * In particular, check for:
-	 *	- Hypervisor CPUID bit
 	 *	- Availability of CPUID leaf 0x8000001f
 	 *	- SEV CPUID bit.
 	 *
@@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	 * can't be checked here.
 	 */
-	if ((fn == 1 && !(regs->cx & BIT(31))))
-		/* Hypervisor bit */
-		goto fail;
-	else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+	if (fn == 0x80000000 && (regs->ax < 0x8000001f))
 		/* SEV leaf check */
 		goto fail;
 	else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
@@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
 	return;
 
 fail:
-	sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
-	VMGEXIT();
-
-	/* Shouldn't get here - if we do halt the machine */
-	while (true)
-		asm volatile("hlt\n");
+	/* Terminate the guest */
+	sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
 }
 
 static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 04a780abb512..26f5479a97a8 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
 }
 
 /*
- * This function handles the case when an NMI is raised in the #VC exception
- * handler entry code. In this case, the IST entry for #VC must be adjusted, so
- * that any subsequent #VC exception will not overwrite the stack contents of the
- * interrupted #VC handler.
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
  *
  * The IST entry is adjusted unconditionally so that it can also be
- * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
- * sev_es_ist_exit() call may adjust back the IST entry too early.
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
  */
 void noinstr __sev_es_ist_enter(struct pt_regs *regs)
 {
 	unsigned long old_ist, new_ist;
 
 	/* Read old IST entry */
-	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+	new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
 
-	/* Make room on the IST stack */
+	/*
+	 * If NMI happened while on the #VC IST stack, set the new IST
+	 * value below regs->sp, so that the interrupted stack frame is
+	 * not overwritten by subsequent #VC exceptions.
+	 */
 	if (on_vc_stack(regs))
-		new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
-	else
-		new_ist = old_ist - sizeof(old_ist);
+		new_ist = regs->sp;
 
-	/* Store old IST entry */
+	/*
+	 * Reserve additional 8 bytes and store old IST value so this
+	 * adjustment can be unrolled in __sev_es_ist_exit().
+	 */
+	new_ist -= sizeof(old_ist);
 	*(unsigned long *)new_ist = old_ist;
 
 	/* Set new IST entry */
@@ -277,7 +289,7 @@ static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
 		return ES_EXCEPTION;
 	}
 
-	insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
+	insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
 	insn_get_length(&ctxt->insn);
 }
 
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ae78cef79980..f633f9e23b8f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include <linux/virtio_config.h>
 #include
 #include
 
@@ -484,3 +485,8 @@ void __init mem_encrypt_init(void)
 
 	print_mem_encrypt_feature_info();
 }
+int arch_has_restricted_virtio_memory_access(void)
+{
+	return sev_active();
+}
+EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..a19374d26101 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp)
 
 #define AMD_SME_BIT	BIT(0)
 #define AMD_SEV_BIT	BIT(1)
-	/*
-	 * Set the feature mask (SME or SEV) based on whether we are
-	 * running under a hypervisor.
-	 */
-	eax = 1;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-	feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+	/* Check the SEV MSR whether SEV or SME is enabled */
+	sev_status   = __rdmsr(MSR_AMD64_SEV);
+	feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
 
 	/*
 	 * Check for the SME/SEV feature:
@@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp)
 
 	/* Check if memory encryption is enabled */
 	if (feature_mask == AMD_SME_BIT) {
+		/*
+		 * No SME if Hypervisor bit is set. This check is here to
+		 * prevent a guest from trying to enable SME. For running as a
+		 * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+		 * might be other hypervisors which emulate that MSR as non-zero
+		 * or even pass it through to the guest.
+		 * A malicious hypervisor can still trick a guest into this
+		 * path, but there is no way to protect against that.
+		 */
+		eax = 1;
+		ecx = 0;
+		native_cpuid(&eax, &ebx, &ecx, &edx);
+		if (ecx & BIT(31))
+			return;
+
 		/* For SME, check the SYSCFG MSR */
 		msr = __rdmsr(MSR_K8_SYSCFG);
 		if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
 			return;
 	} else {
-		/* For SEV, check the SEV MSR */
-		msr = __rdmsr(MSR_AMD64_SEV);
-		if (!(msr & MSR_AMD64_SEV_ENABLED))
-			return;
-
-		/* Save SEV_STATUS to avoid reading MSR again */
-		sev_status = msr;
-
 		/* SEV state cannot be controlled by a command line option */
 		sme_me_mask = me_mask;
 		sev_enabled = true;
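
For reference, the two 32-bit words that startup32_set_idt_entry assembles correspond to the classic protected-mode interrupt-gate layout. Below is a minimal C sketch of the same encoding; the function, macro names and the selector value are illustrative stand-ins, not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_CS	0x08u		/* stand-in for the __KERNEL32_CS selector value */
#define GATE_INT32	0x00008e00u	/* Present, DPL=0, 32-bit interrupt gate (0x00008e00 in the asm) */

/* Build the low and high dwords of a 32-bit IDT entry, mirroring the asm. */
static void build_idt_entry(uint32_t handler, uint32_t *lo, uint32_t *hi)
{
	*lo = (handler & 0x0000ffffu) | (EXAMPLE_CS << 16);	/* offset[15:0] | selector    */
	*hi = (handler & 0xffff0000u) | GATE_INT32;		/* offset[31:16] | gate flags */
}

int main(void)
{
	uint32_t lo, hi;

	build_idt_entry(0x00100040u, &lo, &hi);	/* arbitrary handler address for the example */
	printf("low=0x%08x high=0x%08x\n", lo, hi);
	return 0;
}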
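
The MSR-protocol exchange in sev_es_req_cpuid (shll $30 / orl $4 into the low half of the GHCB MSR, CPUID function in the high half, response code 0x005 with the value in the upper 32 bits) can be spelled out in C as below. The helper names are invented for illustration; only the bit layout mirrors the asm.

#include <stdint.h>
#include <stdio.h>

#define GHCB_MSR_CPUID_REQ	0x004ull	/* GHCBData[11:0]: CPUID request  */
#define GHCB_MSR_CPUID_RESP	0x005ull	/* GHCBData[11:0]: CPUID response */

/* Encode a CPUID request the way sev_es_req_cpuid builds it before wrmsr. */
static uint64_t ghcb_cpuid_req(uint32_t reg_idx, uint32_t fn)
{
	/* reg_idx (0=EAX..3=EDX) goes into bits 31:30, the CPUID function into bits 63:32 */
	return GHCB_MSR_CPUID_REQ | ((uint64_t)(reg_idx & 3) << 30) | ((uint64_t)fn << 32);
}

/* Decode the rdmsr result; returns 0 on success and stores the CPUID value. */
static int ghcb_cpuid_resp(uint64_t msr, uint32_t *value)
{
	if ((msr & 0xfffull) != GHCB_MSR_CPUID_RESP)
		return -1;			/* wrong response code         */
	if (msr & 0x3ffff000ull)
		return -1;			/* bits [12-29] must be zero   */
	*value = (uint32_t)(msr >> 32);
	return 0;
}

int main(void)
{
	uint64_t req = ghcb_cpuid_req(0, 0x8000001f);	/* request EAX of leaf 0x8000001f */
	uint32_t val;

	printf("request MSR value: 0x%016llx\n", (unsigned long long)req);
	/* A well-formed response echoes code 0x005 and carries the data in bits 63:32. */
	if (ghcb_cpuid_resp(((uint64_t)0x12345678 << 32) | GHCB_MSR_CPUID_RESP, &val) == 0)
		printf("decoded CPUID value: 0x%08x\n", val);
	return 0;
}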
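
startup32_vc_handler finds the error code at 16(%esp) and patches the CPUID results into 12/8/4/0(%esp) because the CPU pushes the error code last and the handler then pushes %eax, %ebx, %ecx and %edx on top of it. The struct below is only a model of that frame (names invented), useful for checking the offsets used in the asm.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Stack layout seen by startup32_vc_handler after its four pushl
 * instructions, lowest address first.
 */
struct vc_stack_frame {
	uint32_t saved_edx;	/*  0(%esp) - becomes the CPUID EDX result      */
	uint32_t saved_ecx;	/*  4(%esp) - becomes the CPUID ECX result      */
	uint32_t saved_ebx;	/*  8(%esp) - becomes the CPUID EBX result      */
	uint32_t saved_eax;	/* 12(%esp) - becomes the CPUID EAX result      */
	uint32_t error_code;	/* 16(%esp) - pushed by the CPU, SVM exit code  */
	uint32_t ret_eip;	/* 20(%esp) - points at the faulting CPUID insn */
};

int main(void)
{
	printf("error_code offset: %zu\n", offsetof(struct vc_stack_frame, error_code));
	printf("eax result offset: %zu\n", offsetof(struct vc_stack_frame, saved_eax));
	return 0;
}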
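
The reordered logic in sme_enable() - read MSR_AMD64_SEV first, and consult the CPUID hypervisor bit only on the SME path - can be summarized by the following sketch. The MSR contents and the hypervisor bit are passed in as plain parameters here, purely as assumed inputs; the real code reads them with __rdmsr() and native_cpuid().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEV_ENABLED_BIT		(1ull << 0)	/* MSR_AMD64_SEV[0]                */
#define SYSCFG_MEM_ENCRYPT	(1ull << 23)	/* MSR_K8_SYSCFG memory encryption */

/*
 * Decide whether memory encryption may be enabled, mirroring the order of
 * checks after the patch: the SEV status MSR picks SEV vs. SME, and the
 * hypervisor CPUID bit only blocks the (host-only) SME path.
 */
static bool mem_encrypt_allowed(uint64_t sev_status, uint64_t syscfg, bool hv_bit)
{
	if (sev_status & SEV_ENABLED_BIT)
		return true;				/* SEV guest           */
	if (hv_bit)
		return false;				/* no SME as a guest   */
	return (syscfg & SYSCFG_MEM_ENCRYPT) != 0;	/* SME on bare metal   */
}

int main(void)
{
	printf("SEV guest:      %d\n", mem_encrypt_allowed(SEV_ENABLED_BIT, 0, true));
	printf("guest w/o SEV:  %d\n", mem_encrypt_allowed(0, SYSCFG_MEM_ENCRYPT, true));
	printf("bare-metal SME: %d\n", mem_encrypt_allowed(0, SYSCFG_MEM_ENCRYPT, false));
	return 0;
}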