Merge tag 'x86_boot_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 boot updates from Borislav Petkov:
 "Avoid the baremetal decompressor code when booting on an EFI machine.

  This is mandated by the current tightening of EFI executables
  requirements when used in a secure boot scenario. More specifically,
  an EFI executable cannot have a single section with RWX permissions,
  which conflicts with the in-place kernel decompression that is done
  today.

  Instead, the things required by the booting kernel image are done in
  the EFI stub now.

  Work by Ard Biesheuvel"

* tag 'x86_boot_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/efistub: Avoid legacy decompressor when doing EFI boot
  x86/efistub: Perform SNP feature test while running in the firmware
  efi/libstub: Add limit argument to efi_random_alloc()
  x86/decompressor: Factor out kernel decompression and relocation
  x86/decompressor: Move global symbol references to C code
  decompress: Use 8 byte alignment
  x86/efistub: Prefer EFI memory attributes protocol over DXE services
  x86/efistub: Perform 4/5 level paging switch from the stub
  x86/decompressor: Merge trampoline cleanup with switching code
  x86/decompressor: Pass pgtable address to trampoline directly
  x86/decompressor: Only call the trampoline when changing paging levels
  x86/decompressor: Call trampoline directly from C code
  x86/decompressor: Avoid the need for a stack in the 32-bit trampoline
  x86/decompressor: Use standard calling convention for trampoline
  x86/decompressor: Call trampoline as a normal function
  x86/decompressor: Assign paging related global variables earlier
  x86/decompressor: Store boot_params pointer in callee save register
  x86/efistub: Clear BSS in EFI handover protocol entrypoint
  x86/decompressor: Avoid magic offsets for EFI handover entrypoint
  x86/efistub: Simplify and clean up handover entry code
  ...
commit bd9e99f790
Linus Torvalds, 2023-08-28 15:15:37 -07:00

24 changed files with 589 additions and 564 deletions
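The pull message above boils down to moving kernel decompression out of the in-place, self-modifying path and into the EFI stub, where a separate output buffer can be allocated with sensible permissions. The sketch below is illustrative only and is not code from this merge: it combines the `decompress_kernel()` helper and `kernel_total_size` symbol introduced in the diffs further down with the existing libstub `efi_allocate_pages()` allocator; the function name, the `ULONG_MAX` allocation limit, and the reuse of the decompressor's `error()` callback are assumptions made for the example.

```c
/*
 * Illustrative sketch, not part of this merge: decompress into a freshly
 * allocated buffer under EFI instead of decompressing the image in place.
 * decompress_kernel() and kernel_total_size come from the diffs below;
 * sketch_extract_via_stub() itself is a made-up name.
 */
static efi_status_t sketch_extract_via_stub(unsigned long *kernel_addr)
{
	efi_status_t status;
	unsigned long entry;

	/* grab a separate output buffer from the firmware allocator */
	status = efi_allocate_pages(kernel_total_size, kernel_addr, ULONG_MAX);
	if (status != EFI_SUCCESS)
		return status;

	/* decompress_kernel() returns the entry offset, or ULONG_MAX on failure */
	entry = decompress_kernel((unsigned char *)*kernel_addr,
				  LOAD_PHYSICAL_ADDR, error);
	if (entry == ULONG_MAX)
		return EFI_LOAD_ERROR;

	return EFI_SUCCESS;
}
```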

View file

@@ -1417,7 +1417,7 @@ execution context provided by the EFI firmware.
 
 The function prototype for the handover entry point looks like this::
 
-    efi_main(void *handle, efi_system_table_t *table, struct boot_params *bp)
+    efi_stub_entry(void *handle, efi_system_table_t *table, struct boot_params *bp)
 
 'handle' is the EFI image handle passed to the boot loader by the EFI
 firmware, 'table' is the EFI system table - these are the first two

View file

@@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack
 ifeq ($(CONFIG_LD_IS_BFD),y)
 LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
 endif
+ifeq ($(CONFIG_EFI_STUB),y)
+# ensure that the static EFI stub library will be pulled in, even if it is
+# never referenced explicitly from the startup code
+LDFLAGS_vmlinux += -u efi_pe_entry
+endif
 LDFLAGS_vmlinux += -T
 
 hostprogs	:= mkpiggy

View file

@ -26,8 +26,8 @@
* When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode() * When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
* is the first thing that runs after switching to long mode. Depending on * is the first thing that runs after switching to long mode. Depending on
* whether the EFI handover protocol or the compat entry point was used to * whether the EFI handover protocol or the compat entry point was used to
* enter the kernel, it will either branch to the 64-bit EFI handover * enter the kernel, it will either branch to the common 64-bit EFI stub
* entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF * entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF
* entrypoint efi_pe_entry(). In the former case, the bootloader must provide a * entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
* struct bootparams pointer as the third argument, so the presence of such a * struct bootparams pointer as the third argument, so the presence of such a
* pointer is used to disambiguate. * pointer is used to disambiguate.
@ -37,21 +37,23 @@
* | efi32_pe_entry |---->| | | +-----------+--+ * | efi32_pe_entry |---->| | | +-----------+--+
* +------------------+ | | +------+----------------+ | * +------------------+ | | +------+----------------+ |
* | startup_32 |---->| startup_64_mixed_mode | | * | startup_32 |---->| startup_64_mixed_mode | |
* +------------------+ | | +------+----------------+ V * +------------------+ | | +------+----------------+ |
* | efi32_stub_entry |---->| | | +------------------+ * | efi32_stub_entry |---->| | | |
* +------------------+ +------------+ +---->| efi64_stub_entry | * +------------------+ +------------+ | |
* +-------------+----+ * V |
* +------------+ +----------+ | * +------------+ +----------------+ |
* | startup_64 |<----| efi_main |<--------------+ * | startup_64 |<----| efi_stub_entry |<--------+
* +------------+ +----------+ * +------------+ +----------------+
*/ */
SYM_FUNC_START(startup_64_mixed_mode) SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi mov 0(%rdx), %edi
mov 4(%rdx), %esi mov 4(%rdx), %esi
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx test %edx, %edx
jnz efi64_stub_entry jnz efi_stub_entry
#endif
/* /*
* efi_pe_entry uses MS calling convention, which requires 32 bytes of * efi_pe_entry uses MS calling convention, which requires 32 bytes of
* shadow space on the stack even if all arguments are passed in * shadow space on the stack even if all arguments are passed in
@ -138,6 +140,28 @@ SYM_FUNC_START(__efi64_thunk)
SYM_FUNC_END(__efi64_thunk) SYM_FUNC_END(__efi64_thunk)
.code32 .code32
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
SYM_FUNC_START(efi32_stub_entry)
call 1f
1: popl %ecx
/* Clear BSS */
xorl %eax, %eax
leal (_bss - 1b)(%ecx), %edi
leal (_ebss - 1b)(%ecx), %ecx
subl %edi, %ecx
shrl $2, %ecx
cld
rep stosl
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
/* /*
* EFI service pointer must be in %edi. * EFI service pointer must be in %edi.
* *
@ -218,7 +242,7 @@ SYM_FUNC_END(efi_enter32)
* stub may still exit and return to the firmware using the Exit() EFI boot * stub may still exit and return to the firmware using the Exit() EFI boot
* service.] * service.]
*/ */
SYM_FUNC_START(efi32_entry) SYM_FUNC_START_LOCAL(efi32_entry)
call 1f call 1f
1: pop %ebx 1: pop %ebx
@ -245,10 +269,6 @@ SYM_FUNC_START(efi32_entry)
jmp startup_32 jmp startup_32
SYM_FUNC_END(efi32_entry) SYM_FUNC_END(efi32_entry)
#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
/* /*
* efi_status_t efi32_pe_entry(efi_handle_t image_handle, * efi_status_t efi32_pe_entry(efi_handle_t image_handle,
* efi_system_table_32_t *sys_table) * efi_system_table_32_t *sys_table)
@ -256,8 +276,6 @@ SYM_FUNC_END(efi32_entry)
SYM_FUNC_START(efi32_pe_entry) SYM_FUNC_START(efi32_pe_entry)
pushl %ebp pushl %ebp
movl %esp, %ebp movl %esp, %ebp
pushl %eax // dummy push to allocate loaded_image
pushl %ebx // save callee-save registers pushl %ebx // save callee-save registers
pushl %edi pushl %edi
@ -266,48 +284,8 @@ SYM_FUNC_START(efi32_pe_entry)
movl $0x80000003, %eax // EFI_UNSUPPORTED movl $0x80000003, %eax // EFI_UNSUPPORTED
jnz 2f jnz 2f
call 1f
1: pop %ebx
/* Get the loaded image protocol pointer from the image handle */
leal -4(%ebp), %eax
pushl %eax // &loaded_image
leal (loaded_image_proto - 1b)(%ebx), %eax
pushl %eax // pass the GUID address
pushl 8(%ebp) // pass the image handle
/*
* Note the alignment of the stack frame.
* sys_table
* handle <-- 16-byte aligned on entry by ABI
* return address
* frame pointer
* loaded_image <-- local variable
* saved %ebx <-- 16-byte aligned here
* saved %edi
* &loaded_image
* &loaded_image_proto
* handle <-- 16-byte aligned for call to handle_protocol
*/
movl 12(%ebp), %eax // sys_table
movl ST32_boottime(%eax), %eax // sys_table->boottime
call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
addl $12, %esp // restore argument space
testl %eax, %eax
jnz 2f
movl 8(%ebp), %ecx // image_handle movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table movl 12(%ebp), %edx // sys_table
movl -4(%ebp), %esi // loaded_image
movl LI32_image_base(%esi), %esi // loaded_image->image_base
leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
/*
* We need to set the image_offset variable here since startup_32() will
* use it before we get to the 64-bit efi_pe_entry() in C code.
*/
subl %esi, %ebp // calculate image_offset
movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
xorl %esi, %esi xorl %esi, %esi
jmp efi32_entry // pass %ecx, %edx, %esi jmp efi32_entry // pass %ecx, %edx, %esi
// no other registers remain live // no other registers remain live
@ -318,14 +296,13 @@ SYM_FUNC_START(efi32_pe_entry)
RET RET
SYM_FUNC_END(efi32_pe_entry) SYM_FUNC_END(efi32_pe_entry)
.section ".rodata" #ifdef CONFIG_EFI_HANDOVER_PROTOCOL
/* EFI loaded image protocol GUID */ .org efi32_stub_entry + 0x200
.balign 4 .code64
SYM_DATA_START_LOCAL(loaded_image_proto) SYM_FUNC_START_NOALIGN(efi64_stub_entry)
.long 0x5b1b31a1 jmp efi_handover_entry
.word 0x9562, 0x11d2 SYM_FUNC_END(efi64_stub_entry)
.byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b #endif
SYM_DATA_END(loaded_image_proto)
.data .data
.balign 8 .balign 8

View file

@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELOCATABLE
leal startup_32@GOTOFF(%edx), %ebx leal startup_32@GOTOFF(%edx), %ebx
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32() will be at an
* offset to the start of the space allocated for the image. efi_pe_entry() will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
subl image_offset@GOTOFF(%edx), %ebx
#endif
movl BP_kernel_alignment(%esi), %eax movl BP_kernel_alignment(%esi), %eax
decl %eax decl %eax
addl %eax, %ebx addl %eax, %ebx
@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32)
jmp *%eax jmp *%eax
SYM_FUNC_END(startup_32) SYM_FUNC_END(startup_32)
#ifdef CONFIG_EFI_STUB
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp
movl 8(%esp), %esi /* save boot_params pointer */
call efi_main
/* efi_main returns the possibly relocated address of startup_32 */
jmp *%eax
SYM_FUNC_END(efi32_stub_entry)
SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry)
#endif
.text .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@ -179,13 +155,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
*/ */
/* push arguments for extract_kernel: */ /* push arguments for extract_kernel: */
pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */
pushl %ebp /* output address */ pushl %ebp /* output address */
pushl input_len@GOTOFF(%ebx) /* input_len */
leal input_data@GOTOFF(%ebx), %eax
pushl %eax /* input_data */
leal boot_heap@GOTOFF(%ebx), %eax
pushl %eax /* heap area */
pushl %esi /* real mode pointer */ pushl %esi /* real mode pointer */
call extract_kernel /* returns kernel entry point in %eax */ call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp addl $24, %esp
@ -213,8 +183,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
*/ */
.bss .bss
.balign 4 .balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack: boot_stack:
.fill BOOT_STACK_SIZE, 1, 0 .fill BOOT_STACK_SIZE, 1, 0
boot_stack_end: boot_stack_end:

View file

@ -146,19 +146,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx movl %ebp, %ebx
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32 will be at an
* offset to the start of the space allocated for the image. efi_pe_entry will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
subl rva(image_offset)(%ebp), %ebx
#endif
movl BP_kernel_alignment(%esi), %eax movl BP_kernel_alignment(%esi), %eax
decl %eax decl %eax
addl %eax, %ebx addl %eax, %ebx
@ -294,17 +281,6 @@ SYM_FUNC_START(startup_32)
lret lret
SYM_FUNC_END(startup_32) SYM_FUNC_END(startup_32)
#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
.org 0x190
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
.code64 .code64
.org 0x200 .org 0x200
SYM_CODE_START(startup_64) SYM_CODE_START(startup_64)
@ -346,20 +322,6 @@ SYM_CODE_START(startup_64)
/* Start with the delta to where the kernel will run at. */ /* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE #ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp leaq startup_32(%rip) /* - $startup_32 */, %rbp
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32 will be at an
* offset to the start of the space allocated for the image. efi_pe_entry will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
movl image_offset(%rip), %eax
subq %rax, %rbp
#endif
movl BP_kernel_alignment(%rsi), %eax movl BP_kernel_alignment(%rsi), %eax
decl %eax decl %eax
addq %rax, %rbp addq %rax, %rbp
@ -398,10 +360,6 @@ SYM_CODE_START(startup_64)
* For the trampoline, we need the top page table to reside in lower * For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in * memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode. * 32-bit mode.
*
* We go though the trampoline even if we don't have to: if we're
* already in a desired paging mode. This way the trampoline code gets
* tested on every boot.
*/ */
/* Make sure we have GDT with 32-bit code segment */ /* Make sure we have GDT with 32-bit code segment */
@ -416,10 +374,14 @@ SYM_CODE_START(startup_64)
lretq lretq
.Lon_kernel_cs: .Lon_kernel_cs:
/*
* RSI holds a pointer to a boot_params structure provided by the
* loader, and this needs to be preserved across C function calls. So
* move it into a callee saved register.
*/
movq %rsi, %r15
pushq %rsi
call load_stage1_idt call load_stage1_idt
popq %rsi
#ifdef CONFIG_AMD_MEM_ENCRYPT #ifdef CONFIG_AMD_MEM_ENCRYPT
/* /*
@ -430,63 +392,24 @@ SYM_CODE_START(startup_64)
* CPUID instructions being issued, so go ahead and do that now via * CPUID instructions being issued, so go ahead and do that now via
* sev_enable(), which will also handle the rest of the SEV-related * sev_enable(), which will also handle the rest of the SEV-related
* detection/setup to ensure that has been done in advance of any dependent * detection/setup to ensure that has been done in advance of any dependent
* code. * code. Pass the boot_params pointer as the first argument.
*/ */
pushq %rsi movq %r15, %rdi
movq %rsi, %rdi /* real mode address */
call sev_enable call sev_enable
popq %rsi
#endif #endif
/* /*
* paging_prepare() sets up the trampoline and checks if we need to * configure_5level_paging() updates the number of paging levels using
* enable 5-level paging. * a trampoline in 32-bit addressable memory if the current number does
* not match the desired number.
* *
* paging_prepare() returns a two-quadword structure which lands * Pass the boot_params pointer as the first argument. The second
* into RDX:RAX: * argument is the relocated address of the page table to use instead
* - Address of the trampoline is returned in RAX. * of the page table in trampoline memory (if required).
* - Non zero RDX means trampoline needs to enable 5-level
* paging.
*
* RSI holds real mode data and needs to be preserved across
* this function call.
*/ */
pushq %rsi movq %r15, %rdi
movq %rsi, %rdi /* real mode address */ leaq rva(top_pgtable)(%rbx), %rsi
call paging_prepare call configure_5level_paging
popq %rsi
/* Save the trampoline address in RCX */
movq %rax, %rcx
/*
* Load the address of trampoline_return() into RDI.
* It will be used by the trampoline to return to the main code.
*/
leaq trampoline_return(%rip), %rdi
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
pushq %rax
lretq
trampoline_return:
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaq rva(boot_stack_end)(%rbx), %rsp
/*
* cleanup_trampoline() would restore trampoline memory.
*
* RDI is address of the page table to use instead of page table
* in trampoline memory (if required).
*
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
leaq rva(top_pgtable)(%rbx), %rdi
call cleanup_trampoline
popq %rsi
/* Zero EFLAGS */ /* Zero EFLAGS */
pushq $0 pushq $0
@ -496,7 +419,6 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer * Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe. * where decompression in place becomes safe.
*/ */
pushq %rsi
leaq (_bss-8)(%rip), %rsi leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi leaq rva(_bss-8)(%rbx), %rdi
movl $(_bss - startup_32), %ecx movl $(_bss - startup_32), %ecx
@ -504,7 +426,6 @@ trampoline_return:
std std
rep movsq rep movsq
cld cld
popq %rsi
/* /*
* The GDT may get overwritten either during the copy we just did or * The GDT may get overwritten either during the copy we just did or
@ -523,21 +444,6 @@ trampoline_return:
jmp *%rax jmp *%rax
SYM_CODE_END(startup_64) SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
.org 0x390
#endif
SYM_FUNC_START(efi64_stub_entry)
and $~0xf, %rsp /* realign the stack */
movq %rdx, %rbx /* save boot_params pointer */
call efi_main
movq %rbx,%rsi
leaq rva(startup_64)(%rax), %rax
jmp *%rax
SYM_FUNC_END(efi64_stub_entry)
SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry)
#endif
.text .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated) SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@ -551,128 +457,122 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx shrq $3, %rcx
rep stosq rep stosq
pushq %rsi
call load_stage2_idt call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */ /* Pass boot_params to initialize_identity_maps() */
movq (%rsp), %rdi movq %r15, %rdi
call initialize_identity_maps call initialize_identity_maps
popq %rsi
/* /*
* Do the extraction, and jump to the new kernel.. * Do the extraction, and jump to the new kernel..
*/ */
pushq %rsi /* Save the real mode argument */ /* pass struct boot_params pointer and output target address */
movq %rsi, %rdi /* real mode address */ movq %r15, %rdi
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ movq %rbp, %rsi
leaq input_data(%rip), %rdx /* input_data */
movl input_len(%rip), %ecx /* input_len */
movq %rbp, %r8 /* output target address */
movl output_len(%rip), %r9d /* decompressed length, end of relocs */
call extract_kernel /* returns kernel entry point in %rax */ call extract_kernel /* returns kernel entry point in %rax */
popq %rsi
/* /*
* Jump to the decompressed kernel. * Jump to the decompressed kernel.
*/ */
movq %r15, %rsi
jmp *%rax jmp *%rax
SYM_FUNC_END(.Lrelocated) SYM_FUNC_END(.Lrelocated)
.code32
/* /*
* This is the 32-bit trampoline that will be copied over to low memory. * This is the 32-bit trampoline that will be copied over to low memory. It
* will be called using the ordinary 64-bit calling convention from code
* running in 64-bit mode.
* *
* RDI contains the return address (might be above 4G). * Return address is at the top of the stack (might be above 4G).
* ECX contains the base address of the trampoline memory. * The first argument (EDI) contains the address of the temporary PGD level
* Non zero RDX means trampoline needs to enable 5-level paging. * page table in 32-bit addressable memory which will be programmed into
* register CR3.
*/ */
.section ".rodata", "a", @progbits
SYM_CODE_START(trampoline_32bit_src) SYM_CODE_START(trampoline_32bit_src)
/* Set up data and stack segments */ /*
movl $__KERNEL_DS, %eax * Preserve callee save 64-bit registers on the stack: this is
movl %eax, %ds * necessary because the architecture does not guarantee that GPRs will
movl %eax, %ss * retain their full 64-bit values across a 32-bit mode switch.
*/
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx
/* Set up new stack */ /* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp movq %rsp, %rbx
shrq $32, %rbx
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq 0f(%rip), %rax
pushq %rax
lretq
/*
* The 32-bit code below will do a far jump back to long mode and end
* up here after reconfiguring the number of paging levels. First, the
* stack pointer needs to be restored to its full 64-bit value before
* the callee save register contents can be popped from the stack.
*/
.Lret:
shlq $32, %rbx
orq %rbx, %rsp
/* Restore the preserved 64-bit registers */
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
retq
.code32
0:
/* Disable paging */ /* Disable paging */
movl %cr0, %eax movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0 movl %eax, %cr0
/* Check what paging mode we want to be in after the trampoline */
testl %edx, %edx
jz 1f
/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jnz 3f
jmp 2f
1:
/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jz 3f
2:
/* Point CR3 to the trampoline's new top level page table */ /* Point CR3 to the trampoline's new top level page table */
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax movl %edi, %cr3
movl %eax, %cr3
3:
/* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */ /* Set EFER.LME=1 as a precaution in case hypervsior pulls the rug */
pushl %ecx
pushl %edx
movl $MSR_EFER, %ecx movl $MSR_EFER, %ecx
rdmsr rdmsr
btsl $_EFER_LME, %eax btsl $_EFER_LME, %eax
/* Avoid writing EFER if no change was made (for TDX guest) */ /* Avoid writing EFER if no change was made (for TDX guest) */
jc 1f jc 1f
wrmsr wrmsr
1: popl %edx
popl %ecx
#ifdef CONFIG_X86_MCE
/*
* Preserve CR4.MCE if the kernel will enable #MC support.
* Clearing MCE may fault in some environments (that also force #MC
* support). Any machine check that occurs before #MC support is fully
* configured will crash the system regardless of the CR4.MCE value set
* here.
*/
movl %cr4, %eax
andl $X86_CR4_MCE, %eax
#else
movl $0, %eax
#endif
/* Enable PAE and LA57 (if required) paging modes */
orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
1: 1:
/* Toggle CR4.LA57 */
movl %cr4, %eax
btcl $X86_CR4_LA57_BIT, %eax
movl %eax, %cr4 movl %eax, %cr4
/* Calculate address of paging_enabled() once we are executing in the trampoline */
leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
/* Prepare the stack for far return to Long Mode */
pushl $__KERNEL_CS
pushl %eax
/* Enable paging again. */ /* Enable paging again. */
movl %cr0, %eax movl %cr0, %eax
btsl $X86_CR0_PG_BIT, %eax btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0 movl %eax, %cr0
lret /*
* Return to the 64-bit calling code using LJMP rather than LRET, to
* avoid the need for a 32-bit addressable stack. The destination
* address will be adjusted after the template code is copied into a
* 32-bit addressable buffer.
*/
.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
SYM_CODE_END(trampoline_32bit_src) SYM_CODE_END(trampoline_32bit_src)
.code64 /*
SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled) * This symbol is placed right after trampoline_32bit_src() so its address can
/* Return from the trampoline */ * be used to infer the size of the trampoline code.
jmp *%rdi */
SYM_FUNC_END(.Lpaging_enabled) SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
/* /*
* The trampoline code has a size limit. * The trampoline code has a size limit.
@ -681,7 +581,7 @@ SYM_FUNC_END(.Lpaging_enabled)
*/ */
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
.code32 .text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode) SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1: 1:
@ -726,8 +626,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
*/ */
.bss .bss
.balign 4 .balign 4
SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0)
SYM_DATA_START_LOCAL(boot_stack) SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0 .fill BOOT_STACK_SIZE, 1, 0
.balign 16 .balign 16

View file

@ -330,6 +330,33 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR; return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
} }
const unsigned long kernel_total_size = VO__end - VO__text;
static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
extern unsigned char input_data[];
extern unsigned int input_len, output_len;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
void (*error)(char *x))
{
unsigned long entry;
if (!free_mem_ptr) {
free_mem_ptr = (unsigned long)boot_heap;
free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
}
if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
NULL, error) < 0)
return ULONG_MAX;
entry = parse_elf(outbuf);
handle_relocations(outbuf, output_len, virt_addr);
return entry;
}
/* /*
* The compressed kernel image (ZO), has been moved so that its position * The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel * is against the end of the buffer used to hold the uncompressed kernel
@ -347,14 +374,10 @@ static size_t parse_elf(void *output)
* |-------uncompressed kernel image---------| * |-------uncompressed kernel image---------|
* *
*/ */
asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
unsigned long output_len)
{ {
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR; unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
memptr heap = (memptr)boot_heap;
unsigned long needed_size; unsigned long needed_size;
size_t entry_offset; size_t entry_offset;
@ -412,7 +435,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
* entries. This ensures the full mapped area is usable RAM * entries. This ensures the full mapped area is usable RAM
* and doesn't include any reserved areas. * and doesn't include any reserved areas.
*/ */
needed_size = max(output_len, kernel_total_size); needed_size = max_t(unsigned long, output_len, kernel_total_size);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
#endif #endif
@ -443,7 +466,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL) if (heap > 0x3fffffffffffUL)
error("Destination address too large"); error("Destination address too large");
if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE) if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
error("Destination virtual address is beyond the kernel mapping area"); error("Destination virtual address is beyond the kernel mapping area");
#else #else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff)) if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
@ -461,10 +484,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
accept_memory(__pa(output), __pa(output) + needed_size); accept_memory(__pa(output), __pa(output) + needed_size);
} }
__decompress(input_data, input_len, NULL, NULL, output, output_len, entry_offset = decompress_kernel(output, virt_addr, error);
NULL, error);
entry_offset = parse_elf(output);
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel (entry_offset: 0x"); debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
debug_puthex(entry_offset); debug_puthex(entry_offset);

View file

@@ -179,9 +179,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
 #endif
 
 /* ident_map_64.c */
-#ifdef CONFIG_X86_5LEVEL
 extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
-#endif
 extern void kernel_add_identity_map(unsigned long start, unsigned long end);
 
 /* Used by PAGE_KERN* macros: */

View file

@@ -3,18 +3,16 @@
 
 #define TRAMPOLINE_32BIT_SIZE		(2 * PAGE_SIZE)
 
-#define TRAMPOLINE_32BIT_PGTABLE_OFFSET	0
 #define TRAMPOLINE_32BIT_CODE_OFFSET	PAGE_SIZE
-#define TRAMPOLINE_32BIT_CODE_SIZE	0x80
-
-#define TRAMPOLINE_32BIT_STACK_END	TRAMPOLINE_32BIT_SIZE
+#define TRAMPOLINE_32BIT_CODE_SIZE	0xA0
 
 #ifndef __ASSEMBLER__
 
 extern unsigned long *trampoline_32bit;
 
-extern void trampoline_32bit_src(void *return_ptr);
+extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
+extern const u16 trampoline_ljmp_imm_offset;
 
 #endif /* __ASSEMBLER__ */
 
 #endif /* BOOT_COMPRESSED_PAGETABLE_H */

View file

@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39;
unsigned int __section(".data") ptrs_per_p4d = 1; unsigned int __section(".data") ptrs_per_p4d = 1;
#endif #endif
struct paging_config {
unsigned long trampoline_start;
unsigned long l5_required;
};
/* Buffer to preserve trampoline memory */ /* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
* purposes. * purposes.
* *
* Avoid putting the pointer into .bss as it will be cleared between * Avoid putting the pointer into .bss as it will be cleared between
* paging_prepare() and extract_kernel(). * configure_5level_paging() and extract_kernel().
*/ */
unsigned long *trampoline_32bit __section(".data"); unsigned long *trampoline_32bit __section(".data");
@ -106,12 +101,13 @@ static unsigned long find_trampoline_placement(void)
return bios_start - TRAMPOLINE_32BIT_SIZE; return bios_start - TRAMPOLINE_32BIT_SIZE;
} }
struct paging_config paging_prepare(void *rmode) asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
{ {
struct paging_config paging_config = {}; void (*toggle_la57)(void *cr3);
bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */ /* Initialize boot_params. Required for cmdline_find_option_bool(). */
boot_params = rmode; boot_params = bp;
/* /*
* Check if LA57 is desired and supported. * Check if LA57 is desired and supported.
@ -129,12 +125,22 @@ struct paging_config paging_prepare(void *rmode)
!cmdline_find_option_bool("no5lvl") && !cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 && native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) { (native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1; l5_required = true;
/* Initialize variables for 5-level paging */
__pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
} }
paging_config.trampoline_start = find_trampoline_placement(); /*
* The trampoline will not be used if the paging mode is already set to
* the desired one.
*/
if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
return;
trampoline_32bit = (unsigned long *)paging_config.trampoline_start; trampoline_32bit = (unsigned long *)find_trampoline_placement();
/* Preserve trampoline memory */ /* Preserve trampoline memory */
memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE); memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
@ -143,32 +149,32 @@ struct paging_config paging_prepare(void *rmode)
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE); memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
/* Copy trampoline code in place */ /* Copy trampoline code in place */
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long), toggle_la57 = memcpy(trampoline_32bit +
TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE); &trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
/*
* Avoid the need for a stack in the 32-bit trampoline code, by using
* LJMP rather than LRET to return back to long mode. LJMP takes an
* immediate absolute address, which needs to be adjusted based on the
* placement of the trampoline.
*/
*(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
(unsigned long)toggle_la57;
/* /*
* The code below prepares page table in trampoline memory. * The code below prepares page table in trampoline memory.
* *
* The new page table will be used by trampoline code for switching * The new page table will be used by trampoline code for switching
* from 4- to 5-level paging or vice versa. * from 4- to 5-level paging or vice versa.
*
* If switching is not required, the page table is unused: trampoline
* code wouldn't touch CR3.
*/ */
/* if (l5_required) {
* We are not going to use the page table in trampoline memory if we
* are already in the desired paging mode.
*/
if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
goto out;
if (paging_config.l5_required) {
/* /*
* For 4- to 5-level paging transition, set up current CR3 as * For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table. * the first and the only entry in a new top-level page table.
*/ */
trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC; *trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
} else { } else {
unsigned long src; unsigned long src;
@ -181,38 +187,17 @@ struct paging_config paging_prepare(void *rmode)
* may be above 4G. * may be above 4G.
*/ */
src = *(unsigned long *)__native_read_cr3() & PAGE_MASK; src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long), memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
(void *)src, PAGE_SIZE);
} }
out: toggle_la57(trampoline_32bit);
return paging_config;
}
void cleanup_trampoline(void *pgtable)
{
void *trampoline_pgtable;
trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
/* /*
* Move the top level page table out of trampoline memory, * Move the top level page table out of trampoline memory.
* if it's there.
*/ */
if ((void *)__native_read_cr3() == trampoline_pgtable) { memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); native_write_cr3((unsigned long)pgtable);
native_write_cr3((unsigned long)pgtable);
}
/* Restore trampoline memory */ /* Restore trampoline memory */
memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE); memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
/* Initialize variables for 5-level paging */
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
__pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
}
#endif
} }

View file

@ -367,20 +367,25 @@ static void enforce_vmpl0(void)
*/ */
#define SNP_FEATURES_PRESENT (0) #define SNP_FEATURES_PRESENT (0)
u64 snp_get_unsupported_features(u64 status)
{
if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
return 0;
return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
}
void snp_check_features(void) void snp_check_features(void)
{ {
u64 unsupported; u64 unsupported;
if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/* /*
* Terminate the boot if hypervisor has enabled any feature lacking * Terminate the boot if hypervisor has enabled any feature lacking
* guest side implementation. Pass on the unsupported features mask through * guest side implementation. Pass on the unsupported features mask through
* EXIT_INFO_2 of the GHCB protocol so that those features can be reported * EXIT_INFO_2 of the GHCB protocol so that those features can be reported
* as part of the guest boot failure. * as part of the guest boot failure.
*/ */
unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT; unsupported = snp_get_unsupported_features(sev_status);
if (unsupported) { if (unsupported) {
if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb())) if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
@ -390,10 +395,45 @@ void snp_check_features(void)
} }
} }
void sev_enable(struct boot_params *bp) /*
* sev_check_cpu_support - Check for SEV support in the CPU capabilities
*
* Returns < 0 if SEV is not supported, otherwise the position of the
* encryption bit in the page table descriptors.
*/
static int sev_check_cpu_support(void)
{ {
unsigned int eax, ebx, ecx, edx; unsigned int eax, ebx, ecx, edx;
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return -ENODEV;
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
return -ENODEV;
return ebx & 0x3f;
}
void sev_enable(struct boot_params *bp)
{
struct msr m; struct msr m;
int bitpos;
bool snp; bool snp;
/* /*
@ -413,26 +453,7 @@ void sev_enable(struct boot_params *bp)
* which is good enough. * which is good enough.
*/ */
/* Check for the SME/SEV support leaf */ if (sev_check_cpu_support() < 0)
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
return; return;
/* /*
@ -443,26 +464,8 @@ void sev_enable(struct boot_params *bp)
/* Now repeat the checks with the SNP CPUID table. */ /* Now repeat the checks with the SNP CPUID table. */
/* Recheck the SME/SEV support leaf */ bitpos = sev_check_cpu_support();
eax = 0x80000000; if (bitpos < 0) {
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
/*
* Recheck for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1))) {
if (snp) if (snp)
error("SEV-SNP support indicated by CC blob, but not CPUID."); error("SEV-SNP support indicated by CC blob, but not CPUID.");
return; return;
@ -494,7 +497,24 @@ void sev_enable(struct boot_params *bp)
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
sme_me_mask = BIT_ULL(ebx & 0x3f); sme_me_mask = BIT_ULL(bitpos);
}
/*
* sev_get_status - Retrieve the SEV status mask
*
* Returns 0 if the CPU is not SEV capable, otherwise the value of the
* AMD64_SEV MSR.
*/
u64 sev_get_status(void)
{
struct msr m;
if (sev_check_cpu_support() < 0)
return 0;
boot_rdmsr(MSR_AMD64_SEV, &m);
return m.q;
} }
/* Search for Confidential Computing blob in the EFI config table. */ /* Search for Confidential Computing blob in the EFI config table. */

View file

@@ -62,4 +62,12 @@
 # define BOOT_STACK_SIZE	0x1000
 #endif
 
+#ifndef __ASSEMBLY__
+extern unsigned int output_len;
+extern const unsigned long kernel_total_size;
+
+unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
+				void (*error)(char *x));
+#endif
+
 #endif /* _ASM_X86_BOOT_H */

View file

@@ -90,6 +90,8 @@ static inline void efi_fpu_end(void)
 }
 
 #ifdef CONFIG_X86_32
+#define EFI_X86_KERNEL_ALLOC_LIMIT	(SZ_512M - 1)
+
 #define arch_efi_call_virt_setup()				\
 ({								\
 	efi_fpu_begin();					\
@@ -103,8 +105,7 @@ static inline void efi_fpu_end(void)
 })
 
 #else /* !CONFIG_X86_32 */
+#define EFI_X86_KERNEL_ALLOC_LIMIT	EFI_ALLOC_LIMIT
 
-#define EFI_LOADER_SIGNATURE	"EL64"
-
 extern asmlinkage u64 __efi_call(void *fp, ...);
 
@@ -218,6 +219,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
 
 #ifdef CONFIG_EFI_MIXED
 
+#define EFI_ALLOC_LIMIT		(efi_is_64bit() ? ULONG_MAX : U32_MAX)
+
 #define ARCH_HAS_EFISTUB_WRAPPERS
 
 static inline bool efi_is_64bit(void)

View file

@@ -164,6 +164,7 @@ static __always_inline void sev_es_nmi_complete(void)
 		__sev_es_nmi_complete();
 }
 extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+extern void sev_enable(struct boot_params *bp);
 
 static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
 {
@@ -210,12 +211,15 @@ bool snp_init(struct boot_params *bp);
 void __init __noreturn snp_abort(void);
 int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
 void snp_accept_memory(phys_addr_t start, phys_addr_t end);
+u64 snp_get_unsupported_features(u64 status);
+u64 sev_get_status(void);
 #else
 static inline void sev_es_ist_enter(struct pt_regs *regs) { }
 static inline void sev_es_ist_exit(void) { }
 static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
 static inline void sev_es_nmi_complete(void) { }
 static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+static inline void sev_enable(struct boot_params *bp) { }
 static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
 static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
 static inline void setup_ghcb(void) { }
@@ -235,6 +239,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
 }
 
 static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
+static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
+static inline u64 sev_get_status(void) { return 0; }
 
 #endif
 
 #endif

View file

@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64)
* for us. These identity mapped page tables map all of the * for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory. * kernel pages and possibly all of memory.
* *
* %rsi holds a physical pointer to real_mode_data. * %RSI holds the physical address of the boot_params structure
* provided by the bootloader. Preserve it in %R15 so C function calls
* will not clobber it.
* *
* We come here either directly from a 64bit bootloader, or from * We come here either directly from a 64bit bootloader, or from
* arch/x86/boot/compressed/head_64.S. * arch/x86/boot/compressed/head_64.S.
@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* compiled to run at we first fixup the physical addresses in our page * compiled to run at we first fixup the physical addresses in our page
* tables and then reload them. * tables and then reload them.
*/ */
mov %rsi, %r15
/* Set up the stack for verify_cpu() */ /* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx shrq $32, %rdx
wrmsr wrmsr
pushq %rsi
call startup_64_setup_env call startup_64_setup_env
popq %rsi
/* Now switch to __KERNEL_CS so IRET works reliably */ /* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS pushq $__KERNEL_CS
@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64)
* Activate SEV/SME memory encryption if supported/enabled. This needs to * Activate SEV/SME memory encryption if supported/enabled. This needs to
* be done now, since this also includes setup of the SEV-SNP CPUID table, * be done now, since this also includes setup of the SEV-SNP CPUID table,
* which needs to be done before any CPUID instructions are executed in * which needs to be done before any CPUID instructions are executed in
* subsequent code. * subsequent code. Pass the boot_params pointer as the first argument.
*/ */
movq %rsi, %rdi movq %r15, %rdi
pushq %rsi
call sme_enable call sme_enable
popq %rsi
#endif #endif
/* Sanitize CPU configuration */ /* Sanitize CPU configuration */
@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* programmed into CR3. * programmed into CR3.
*/ */
leaq _text(%rip), %rdi leaq _text(%rip), %rdi
pushq %rsi movq %r15, %rsi
call __startup_64 call __startup_64
popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */ /* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax addq $(early_top_pgt - __START_KERNEL_map), %rax
@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table. * and someone has loaded a mapped page table.
* *
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses) * We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses). * or from trampoline.S (using virtual addresses).
* *
@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_END_OF_STACK UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR ANNOTATE_NOENDBR
/* Clear %R15 which holds the boot_params pointer on the boot CPU */
xorq %r15, %r15
/* /*
* Retrieve the modifier (SME encryption mask if SME is active) to be * Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3. * added to the initial pgdir entry that will be programmed into CR3.
@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* hypervisor could lie about the C-bit position to perform a ROP * hypervisor could lie about the C-bit position to perform a ROP
* attack on the guest by writing to the unencrypted stack and wait for * attack on the guest by writing to the unencrypted stack and wait for
* the next RET instruction. * the next RET instruction.
* %rsi carries pointer to realmode data and is callee-clobbered. Save
* and restore it.
*/ */
pushq %rsi
movq %rax, %rdi movq %rax, %rdi
call sev_verify_cbit call sev_verify_cbit
popq %rsi
/* /*
* Switch to new page-table * Switch to new page-table
@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
wrmsr wrmsr
/* Setup and Load IDT */ /* Setup and Load IDT */
pushq %rsi
call early_setup_idt call early_setup_idt
popq %rsi
/* Check if nx is implemented */ /* Check if nx is implemented */
movl $0x80000001, %eax movl $0x80000001, %eax
@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
pushq $0 pushq $0
popfq popfq
/* rsi is pointer to real mode structure with interesting info. /* Pass the boot_params pointer as first argument */
pass it to C */ movq %r15, %rdi
movq %rsi, %rdi
.Ljump_to_C_code: .Ljump_to_C_code:
/* /*

View file

@@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB)	+= efi-stub.o string.o intrinsics.o systable.o \
 lib-$(CONFIG_ARM)		+= arm32-stub.o
 lib-$(CONFIG_ARM64)		+= arm64.o arm64-stub.o smbios.o
 lib-$(CONFIG_X86)		+= x86-stub.o
+lib-$(CONFIG_X86_64)		+= x86-5lvl.o
 lib-$(CONFIG_RISCV)		+= riscv.o riscv-stub.o
 lib-$(CONFIG_LOONGARCH)	+= loongarch.o loongarch-stub.o
 

View file

@@ -106,7 +106,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
 		 */
 		status = efi_random_alloc(*reserve_size, min_kimg_align,
 					  reserve_addr, phys_seed,
-					  EFI_LOADER_CODE);
+					  EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
 		if (status != EFI_SUCCESS)
 			efi_warn("efi_random_alloc() failed: 0x%lx\n", status);
 	} else {

View file

@@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
 			efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
 		} else if (!strcmp(param, "noinitrd")) {
 			efi_noinitrd = true;
+		} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
+			efi_no5lvl = true;
 		} else if (!strcmp(param, "efi") && val) {
 			efi_nochunk = parse_option_str(val, "nochunk");
 			efi_novamap |= parse_option_str(val, "novamap");

View file

@@ -33,6 +33,7 @@
 #define EFI_ALLOC_LIMIT		ULONG_MAX
 #endif
 
+extern bool efi_no5lvl;
 extern bool efi_nochunk;
 extern bool efi_nokaslr;
 extern int efi_loglevel;
@@ -955,7 +956,7 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out);
 
 efi_status_t efi_random_alloc(unsigned long size, unsigned long align,
 			      unsigned long *addr, unsigned long random_seed,
-			      int memory_type);
+			      int memory_type, unsigned long alloc_limit);
 
 efi_status_t efi_random_get_seed(void);

View file

@@ -16,7 +16,8 @@
  */
 static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
					 unsigned long size,
-					 unsigned long align_shift)
+					 unsigned long align_shift,
+					 u64 alloc_limit)
 {
 	unsigned long align = 1UL << align_shift;
 	u64 first_slot, last_slot, region_end;
@@ -29,7 +30,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
 		return 0;
 
 	region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1,
-			 (u64)EFI_ALLOC_LIMIT);
+			 alloc_limit);
 	if (region_end < size)
 		return 0;
 
@@ -54,7 +55,8 @@ efi_status_t efi_random_alloc(unsigned long size,
 			      unsigned long align,
 			      unsigned long *addr,
 			      unsigned long random_seed,
-			      int memory_type)
+			      int memory_type,
+			      unsigned long alloc_limit)
 {
 	unsigned long total_slots = 0, target_slot;
 	unsigned long total_mirrored_slots = 0;
@@ -76,7 +78,7 @@ efi_status_t efi_random_alloc(unsigned long size,
 		efi_memory_desc_t *md = (void *)map->map + map_offset;
 		unsigned long slots;
 
-		slots = get_entry_num_slots(md, size, ilog2(align));
+		slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit);
 		MD_NUM_SLOTS(md) = slots;
 		total_slots += slots;
 		if (md->attribute & EFI_MEMORY_MORE_RELIABLE)

View file

@@ -0,0 +1,95 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/efi.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/efi.h>
#include "efistub.h"
#include "x86-stub.h"
bool efi_no5lvl;
static void (*la57_toggle)(void *cr3);
static const struct desc_struct gdt[] = {
[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
[GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
};
/*
* Enabling (or disabling) 5 level paging is tricky, because it can only be
* done from 32-bit mode with paging disabled. This means not only that the
* code itself must be running from 32-bit addressable physical memory, but
* also that the root page table must be 32-bit addressable, as programming
* a 64-bit value into CR3 when running in 32-bit mode is not supported.
*/
efi_status_t efi_setup_5level_paging(void)
{
u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
efi_status_t status;
u8 *la57_code;
if (!efi_is_64bit())
return EFI_SUCCESS;
/* check for 5 level paging support */
if (native_cpuid_eax(0) < 7 ||
!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
return EFI_SUCCESS;
/* allocate some 32-bit addressable memory for code and a page table */
status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
U32_MAX);
if (status != EFI_SUCCESS)
return status;
la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
/*
* To avoid the need to allocate a 32-bit addressable stack, the
* trampoline uses a LJMP instruction to switch back to long mode.
* LJMP takes an absolute destination address, which needs to be
* fixed up at runtime.
*/
*(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
return EFI_SUCCESS;
}
void efi_5level_switch(void)
{
bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
bool have_la57 = native_read_cr4() & X86_CR4_LA57;
bool need_toggle = want_la57 ^ have_la57;
u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
u64 *cr3 = (u64 *)__native_read_cr3();
u64 *new_cr3;
if (!la57_toggle || !need_toggle)
return;
if (!have_la57) {
/*
* 5 level paging will be enabled, so a root level page needs
* to be allocated from the 32-bit addressable physical region,
* with its first entry referring to the existing hierarchy.
*/
new_cr3 = memset(pgt, 0, PAGE_SIZE);
new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
} else {
/* take the new root table pointer from the current entry #0 */
new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
/* copy the new root table if it is not 32-bit addressable */
if ((u64)new_cr3 > U32_MAX)
new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
}
native_load_gdt(&(struct desc_ptr){ sizeof(gdt) - 1, (u64)gdt });
la57_toggle(new_cr3);
}
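The feature probe in efi_setup_5level_paging() above boils down to CPUID leaf 7, sub-leaf 0, ECX bit 16 (LA57); X86_FEATURE_LA57 & 31 evaluates to that bit number. Below is a minimal user-space sketch of the same test, an illustration only, using the compiler's <cpuid.h> helpers rather than the stub's native_cpuid_*() wrappers.

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID leaf 7, sub-leaf 0; fails if the leaf is not implemented. */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            puts("CPUID leaf 7 not available");
            return 1;
        }

        /* ECX bit 16 advertises 57-bit linear addresses (5-level paging). */
        printf("LA57 %ssupported by this CPU\n", (ecx & (1u << 16)) ? "" : "not ");
        return 0;
    }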


@@ -15,16 +15,16 @@
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/boot.h>
+#include <asm/kaslr.h>
+#include <asm/sev.h>
 
 #include "efistub.h"
+#include "x86-stub.h"
 
-/* Maximum physical address for 64-bit kernel with 4-level paging */
-#define MAXMEM_X86_64_4LEVEL (1ull << 46)
-
 const efi_system_table_t *efi_system_table;
 const efi_dxe_services_table_t *efi_dxe_table;
-u32 image_offset __section(".data");
 static efi_loaded_image_t *image = NULL;
+static efi_memory_attribute_protocol_t *memattr;
 
 typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t;
 union sev_memory_acceptance_protocol {
@@ -223,8 +223,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
 	}
 }
 
-static void
-adjust_memory_range_protection(unsigned long start, unsigned long size)
+void efi_adjust_memory_range_protection(unsigned long start,
+					unsigned long size)
 {
 	efi_status_t status;
 	efi_gcd_memory_space_desc_t desc;
@@ -232,12 +232,18 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
 	unsigned long rounded_start, rounded_end;
 	unsigned long unprotect_start, unprotect_size;
 
-	if (efi_dxe_table == NULL)
-		return;
-
 	rounded_start = rounddown(start, EFI_PAGE_SIZE);
 	rounded_end = roundup(start + size, EFI_PAGE_SIZE);
 
+	if (memattr != NULL) {
+		efi_call_proto(memattr, clear_memory_attributes, rounded_start,
+			       rounded_end - rounded_start, EFI_MEMORY_XP);
+		return;
+	}
+
+	if (efi_dxe_table == NULL)
+		return;
+
 	/*
 	 * Don't modify memory region attributes, they are
 	 * already suitable, to lower the possibility to
@@ -278,49 +284,6 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
 	}
 }
 
-/*
- * Trampoline takes 2 pages and can be loaded in first megabyte of memory
- * with its end placed between 128k and 640k where BIOS might start.
- * (see arch/x86/boot/compressed/pgtable_64.c)
- *
- * We cannot find exact trampoline placement since memory map
- * can be modified by UEFI, and it can alter the computed address.
- */
-#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024)
-#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024)
-
-void startup_32(struct boot_params *boot_params);
-
-static void
-setup_memory_protection(unsigned long image_base, unsigned long image_size)
-{
-	/*
-	 * Allow execution of possible trampoline used
-	 * for switching between 4- and 5-level page tables
-	 * and relocated kernel image.
-	 */
-
-	adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE,
-				       TRAMPOLINE_PLACEMENT_SIZE);
-
-#ifdef CONFIG_64BIT
-	if (image_base != (unsigned long)startup_32)
-		adjust_memory_range_protection(image_base, image_size);
-#else
-	/*
-	 * Clear protection flags on a whole range of possible
-	 * addresses used for KASLR. We don't need to do that
-	 * on x86_64, since KASLR/extraction is performed after
-	 * dedicated identity page tables are built and we only
-	 * need to remove possible protection on relocated image
-	 * itself disregarding further relocations.
-	 */
-	adjust_memory_range_protection(LOAD_PHYSICAL_ADDR,
-				       KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR);
-#endif
-}
-
 static void setup_unaccepted_memory(void)
 {
 	efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID;
@@ -346,9 +309,7 @@ static void setup_unaccepted_memory(void)
 
 static const efi_char16_t apple[] = L"Apple";
 
-static void setup_quirks(struct boot_params *boot_params,
-			 unsigned long image_base,
-			 unsigned long image_size)
+static void setup_quirks(struct boot_params *boot_params)
 {
 	efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
 		efi_table_attr(efi_system_table, fw_vendor);
@@ -357,9 +318,6 @@ static void setup_quirks(struct boot_params *boot_params,
 		if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
 			retrieve_apple_device_properties(boot_params);
 	}
-
-	if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES))
-		setup_memory_protection(image_base, image_size);
 }
 
 /*
@@ -512,7 +470,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
 	}
 
 	image_base = efi_table_attr(image, image_base);
-	image_offset = (void *)startup_32 - image_base;
 
 	status = efi_allocate_pages(sizeof(struct boot_params),
 				    (unsigned long *)&boot_params, ULONG_MAX);
@@ -803,19 +760,96 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
 	return EFI_SUCCESS;
 }
 
-/*
- * On success, we return the address of startup_32, which has potentially been
- * relocated by efi_relocate_kernel.
- * On failure, we exit to the firmware via efi_exit instead of returning.
- */
-asmlinkage unsigned long efi_main(efi_handle_t handle,
-				  efi_system_table_t *sys_table_arg,
-				  struct boot_params *boot_params)
+static bool have_unsupported_snp_features(void)
 {
-	unsigned long bzimage_addr = (unsigned long)startup_32;
-	unsigned long buffer_start, buffer_end;
+	u64 unsupported;
+
+	unsupported = snp_get_unsupported_features(sev_get_status());
+	if (unsupported) {
+		efi_err("Unsupported SEV-SNP features detected: 0x%llx\n",
+			unsupported);
+		return true;
+	}
+	return false;
+}
+
+static void efi_get_seed(void *seed, int size)
+{
+	efi_get_random_bytes(size, seed);
+
+	/*
+	 * This only updates seed[0] when running on 32-bit, but in that case,
+	 * seed[1] is not used anyway, as there is no virtual KASLR on 32-bit.
+	 */
+	*(unsigned long *)seed ^= kaslr_get_random_long("EFI");
+}
+
+static void error(char *str)
+{
+	efi_warn("Decompression failed: %s\n", str);
+}
+
+static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry)
+{
+	unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+	unsigned long addr, alloc_size, entry;
+	efi_status_t status;
+	u32 seed[2] = {};
+
+	/* determine the required size of the allocation */
+	alloc_size = ALIGN(max_t(unsigned long, output_len, kernel_total_size),
+			   MIN_KERNEL_ALIGN);
+
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) {
+		u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size;
+
+		efi_get_seed(seed, sizeof(seed));
+
+		virt_addr += (range * seed[1]) >> 32;
+		virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1);
+	}
+
+	status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr,
+				  seed[0], EFI_LOADER_CODE,
+				  EFI_X86_KERNEL_ALLOC_LIMIT);
+	if (status != EFI_SUCCESS)
+		return status;
+
+	entry = decompress_kernel((void *)addr, virt_addr, error);
+	if (entry == ULONG_MAX) {
+		efi_free(alloc_size, addr);
+		return EFI_LOAD_ERROR;
+	}
+
+	*kernel_entry = addr + entry;
+
+	efi_adjust_memory_range_protection(addr, kernel_total_size);
+
+	return EFI_SUCCESS;
+}
+
+static void __noreturn enter_kernel(unsigned long kernel_addr,
+				    struct boot_params *boot_params)
+{
+	/* enter decompressed kernel with boot_params pointer in RSI/ESI */
+	asm("jmp *%0"::"r"(kernel_addr), "S"(boot_params));
+
+	unreachable();
+}
+
+/*
+ * On success, this routine will jump to the relocated image directly and never
+ * return. On failure, it will exit to the firmware via efi_exit() instead of
+ * returning.
+ */
+void __noreturn efi_stub_entry(efi_handle_t handle,
+			       efi_system_table_t *sys_table_arg,
+			       struct boot_params *boot_params)
+{
+	efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID;
 	struct setup_header *hdr = &boot_params->hdr;
 	const struct linux_efi_initrd *initrd = NULL;
+	unsigned long kernel_entry;
 	efi_status_t status;
 
 	efi_system_table = sys_table_arg;
@@ -823,65 +857,25 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
 	if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
 		efi_exit(handle, EFI_INVALID_PARAMETER);
 
-	efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
-	if (efi_dxe_table &&
-	    efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
-		efi_warn("Ignoring DXE services table: invalid signature\n");
-		efi_dxe_table = NULL;
+	if (have_unsupported_snp_features())
+		efi_exit(handle, EFI_UNSUPPORTED);
+
+	if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) {
+		efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
+		if (efi_dxe_table &&
+		    efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
+			efi_warn("Ignoring DXE services table: invalid signature\n");
+			efi_dxe_table = NULL;
+		}
 	}
 
-	/*
-	 * If the kernel isn't already loaded at a suitable address,
-	 * relocate it.
-	 *
-	 * It must be loaded above LOAD_PHYSICAL_ADDR.
-	 *
-	 * The maximum address for 64-bit is 1 << 46 for 4-level paging. This
-	 * is defined as the macro MAXMEM, but unfortunately that is not a
-	 * compile-time constant if 5-level paging is configured, so we instead
-	 * define our own macro for use here.
-	 *
-	 * For 32-bit, the maximum address is complicated to figure out, for
-	 * now use KERNEL_IMAGE_SIZE, which will be 512MiB, the same as what
-	 * KASLR uses.
-	 *
-	 * Also relocate it if image_offset is zero, i.e. the kernel wasn't
-	 * loaded by LoadImage, but rather by a bootloader that called the
-	 * handover entry. The reason we must always relocate in this case is
-	 * to handle the case of systemd-boot booting a unified kernel image,
-	 * which is a PE executable that contains the bzImage and an initrd as
-	 * COFF sections. The initrd section is placed after the bzImage
-	 * without ensuring that there are at least init_size bytes available
-	 * for the bzImage, and thus the compressed kernel's startup code may
-	 * overwrite the initrd unless it is moved out of the way.
-	 */
-	buffer_start = ALIGN(bzimage_addr - image_offset,
-			     hdr->kernel_alignment);
-	buffer_end = buffer_start + hdr->init_size;
-
-	if ((buffer_start < LOAD_PHYSICAL_ADDR) ||
-	    (IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) ||
-	    (IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) ||
-	    (image_offset == 0)) {
-		extern char _bss[];
-
-		status = efi_relocate_kernel(&bzimage_addr,
-					     (unsigned long)_bss - bzimage_addr,
-					     hdr->init_size,
-					     hdr->pref_address,
-					     hdr->kernel_alignment,
-					     LOAD_PHYSICAL_ADDR);
-		if (status != EFI_SUCCESS) {
-			efi_err("efi_relocate_kernel() failed!\n");
-			goto fail;
-		}
-		/*
-		 * Now that we've copied the kernel elsewhere, we no longer
-		 * have a set up block before startup_32(), so reset image_offset
-		 * to zero in case it was set earlier.
-		 */
-		image_offset = 0;
+	/* grab the memory attributes protocol if it exists */
+	efi_bs_call(locate_protocol, &guid, NULL, (void **)&memattr);
+
+	status = efi_setup_5level_paging();
+	if (status != EFI_SUCCESS) {
+		efi_err("efi_setup_5level_paging() failed!\n");
+		goto fail;
 	}
 
 #ifdef CONFIG_CMDLINE_BOOL
@@ -901,6 +895,12 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
 		}
 	}
 
+	status = efi_decompress_kernel(&kernel_entry);
+	if (status != EFI_SUCCESS) {
+		efi_err("Failed to decompress kernel\n");
+		goto fail;
+	}
+
 	/*
 	 * At this point, an initrd may already have been loaded by the
 	 * bootloader and passed via bootparams. We permit an initrd loaded
@@ -940,7 +940,7 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
 
 	setup_efi_pci(boot_params);
 
-	setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start);
+	setup_quirks(boot_params);
 
 	setup_unaccepted_memory();
@@ -950,9 +950,38 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
 		goto fail;
 	}
 
-	return bzimage_addr;
+	/*
+	 * Call the SEV init code while still running with the firmware's
+	 * GDT/IDT, so #VC exceptions will be handled by EFI.
+	 */
+	sev_enable(boot_params);
+
+	efi_5level_switch();
+
+	enter_kernel(kernel_entry, boot_params);
 fail:
-	efi_err("efi_main() failed!\n");
+	efi_err("efi_stub_entry() failed!\n");
 
 	efi_exit(handle, status);
 }
+
+#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
+void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+			struct boot_params *boot_params)
+{
+	extern char _bss[], _ebss[];
+
+	memset(_bss, 0, _ebss - _bss);
+	efi_stub_entry(handle, sys_table_arg, boot_params);
+}
+
+#ifndef CONFIG_EFI_MIXED
+extern __alias(efi_handover_entry)
+void efi32_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+		      struct boot_params *boot_params);
+
+extern __alias(efi_handover_entry)
+void efi64_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
+		      struct boot_params *boot_params);
+#endif
+#endif
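One detail of efi_decompress_kernel() above that is easy to gloss over: virt_addr += (range * seed[1]) >> 32 scales a uniform 32-bit random value into [0, range) with a fixed-point multiply instead of a division, and the following mask rounds the result down to the kernel's alignment. A stand-alone sketch of that arithmetic follows; the constants are invented for the example, while in the stub they come from the kernel configuration.

    #include <stdint.h>
    #include <stdio.h>

    /* Example values only; the real ones come from the kernel config. */
    #define LOAD_PHYSICAL_ADDR  0x1000000ULL    /* 16 MiB */
    #define KERNEL_IMAGE_SIZE   0x40000000ULL   /* 1 GiB  */
    #define PHYSICAL_ALIGN      0x200000ULL     /* 2 MiB  */

    static uint64_t kaslr_virt_addr(uint32_t seed, uint64_t kernel_total_size)
    {
        uint64_t range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size;
        uint64_t virt_addr = LOAD_PHYSICAL_ADDR;

        /* (range * seed) >> 32 maps seed in [0, 2^32) onto [0, range). */
        virt_addr += (range * seed) >> 32;

        /* Round down to the kernel's alignment. */
        return virt_addr & ~(PHYSICAL_ALIGN - 1);
    }

    int main(void)
    {
        /* A mid-range seed lands roughly in the middle of the window. */
        printf("0x%llx\n",
               (unsigned long long)kaslr_virt_addr(0x80000000u, 64 << 20));
        return 0;
    }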


@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <linux/efi.h>
+
+extern void trampoline_32bit_src(void *, bool);
+extern const u16 trampoline_ljmp_imm_offset;
+
+void efi_adjust_memory_range_protection(unsigned long start,
+					unsigned long size);
+
+#ifdef CONFIG_X86_64
+efi_status_t efi_setup_5level_paging(void);
+void efi_5level_switch(void);
+#else
+static inline efi_status_t efi_setup_5level_paging(void) { return EFI_SUCCESS; }
+static inline void efi_5level_switch(void) {}
+#endif


@@ -119,7 +119,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab)
 	}
 
 	status = efi_random_alloc(alloc_size, min_kimg_align, &image_base,
-				  seed, EFI_LOADER_CODE);
+				  seed, EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
 	if (status != EFI_SUCCESS) {
 		efi_err("Failed to allocate memory\n");
 		goto free_cmdline;


@@ -48,7 +48,7 @@ MALLOC_VISIBLE void *malloc(int size)
 	if (!malloc_ptr)
 		malloc_ptr = free_mem_ptr;
 
-	malloc_ptr = (malloc_ptr + 3) & ~3;	/* Align */
+	malloc_ptr = (malloc_ptr + 7) & ~7;	/* Align */
 
 	p = (void *)malloc_ptr;
 	malloc_ptr += size;
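The final hunk widens the decompressor's simple bump allocator from 4- to 8-byte alignment using the usual round-up-to-a-power-of-two idiom: add (align - 1), then clear the low bits. A quick stand-alone check of that identity, with example values only:

    #include <stdint.h>
    #include <stdio.h>

    /* Round x up to the next multiple of align (align must be a power of two). */
    static uintptr_t align_up(uintptr_t x, uintptr_t align)
    {
        return (x + align - 1) & ~(align - 1);
    }

    int main(void)
    {
        /* 0x1005 rounds up to 0x1008 for both 4- and 8-byte alignment;
         * 0x100c is already 4-byte aligned but moves to 0x1010 for 8. */
        printf("%#lx %#lx\n", (unsigned long)align_up(0x1005, 8),
               (unsigned long)align_up(0x100c, 8));
        return 0;
    }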