- Avoid the baremetal decompressor code when booting on an EFI machine.

   This is mandated by the current tightening of requirements for EFI
   executables used in a secure boot scenario. More specifically, an EFI
   executable cannot have any section with RWX permissions, which
   conflicts with the in-place kernel decompression that is done today.
   Instead, the things required by the booting kernel image are done in
   the EFI stub now. Work by Ard Biesheuvel.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmTsMGAACgkQEsHwGGHe
 VUqqGg/+LJq7dapnyMiqafO9Xdpm+h7wTylChdfev7UB3wri9IJGXXFkwZZUNF8Z
 p2c/ECk75CG+z7ZSv+WirZLWlmlBT1D51YjI1lHmg3Vmh3YXZWJHkwaG1DI4euL7
 C4gvd/gmpXGMOxjZLV/D5d7wwoO2REyVkDeCy0lqHCisvd4GRXqUJHe6hJ3GxVv/
 VqbT7ZHMP4BlbWK4/yTq9+wKSC9T0OfAOW2b+K/SAh19TpRqFBpkMOxaZx0nj2Eh
 UxQUd2hntGtEGpPf71HPQMYCkWyvJD1LpTYfwld7tNvzXSt2xg2YZAf7WzJ0Od3b
 kxg2mItHhVRCcwEc9OWlphvgAYCzYECzeMniesLFeJYLMvHNtORzxNPkDLrOYjT6
 0K05DEWJlEqZhIWtaKJ/SSCj7JnfmgubfK3tG3cerCelQZIvUix6ysvDBwPnqdUr
 3O1d7iMlM1UtqMo0GLeIp0q1aRKMsNfRtUvqjaiQ2NjSuyJqZl8t0jfKd6DHcnuy
 WidBMhREULa5qYGE9dxosdTJK1wOhqomTmBSC8EeDUPevLV+DNQUn+oBOVhFvJVk
 gpmsr2mi9+hwC9iNh1K4wslLl9Jc/BlIY0wwC0ysh6eTWQfrvpfXrtHRpmTvwcVW
 g2lt+shYiPDa7q3bvqQL7iqFU6bkMGCs89EFGJEpNbBO076uNZY=
 =2Oej
 -----END PGP SIGNATURE-----

Merge tag 'x86_boot_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 boot updates from Borislav Petkov:
 "Avoid the baremetal decompressor code when booting on an EFI machine.

   This is mandated by the current tightening of requirements for EFI
   executables used in a secure boot scenario. More specifically, an EFI
   executable cannot have any section with RWX permissions, which
   conflicts with the in-place kernel decompression that is done today.

  Instead, the things required by the booting kernel image are done in
  the EFI stub now.

  Work by Ard Biesheuvel"
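In outline: rather than decompressing the kernel on top of itself, the stub
now allocates a suitably sized buffer from the firmware, decompresses into
it, and fixes up the memory permissions afterwards. A condensed sketch of
that flow, using only helpers that appear in the hunks below (KASLR seeding
and some error handling trimmed, so this is illustrative rather than a
drop-in implementation):

    /* condensed from the x86-stub.c hunk further below */
    static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry)
    {
            unsigned long addr, entry;
            /* buffer must fit the larger of the decompressed payload
               (output_len) and the final kernel (kernel_total_size) */
            unsigned long alloc_size = ALIGN(max_t(unsigned long, output_len,
                                                   kernel_total_size),
                                             MIN_KERNEL_ALIGN);

            /* a firmware allocation replaces in-place decompression;
               seed 0 here because the KASLR path is elided */
            efi_status_t status = efi_random_alloc(alloc_size,
                                                   CONFIG_PHYSICAL_ALIGN,
                                                   &addr, 0, EFI_LOADER_CODE,
                                                   EFI_X86_KERNEL_ALLOC_LIMIT);
            if (status != EFI_SUCCESS)
                    return status;

            /* error() is the stub-local warning helper shown below */
            entry = decompress_kernel((void *)addr, LOAD_PHYSICAL_ADDR, error);
            if (entry == ULONG_MAX)
                    return EFI_LOAD_ERROR;

            /* make the range executable; no RWX section is ever needed */
            efi_adjust_memory_range_protection(addr, kernel_total_size);

            *kernel_entry = addr + entry;
            return EFI_SUCCESS;
    }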

* tag 'x86_boot_for_v6.6_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (23 commits)
  x86/efistub: Avoid legacy decompressor when doing EFI boot
  x86/efistub: Perform SNP feature test while running in the firmware
  efi/libstub: Add limit argument to efi_random_alloc()
  x86/decompressor: Factor out kernel decompression and relocation
  x86/decompressor: Move global symbol references to C code
  decompress: Use 8 byte alignment
  x86/efistub: Prefer EFI memory attributes protocol over DXE services
  x86/efistub: Perform 4/5 level paging switch from the stub
  x86/decompressor: Merge trampoline cleanup with switching code
  x86/decompressor: Pass pgtable address to trampoline directly
  x86/decompressor: Only call the trampoline when changing paging levels
  x86/decompressor: Call trampoline directly from C code
  x86/decompressor: Avoid the need for a stack in the 32-bit trampoline
  x86/decompressor: Use standard calling convention for trampoline
  x86/decompressor: Call trampoline as a normal function
  x86/decompressor: Assign paging related global variables earlier
  x86/decompressor: Store boot_params pointer in callee save register
  x86/efistub: Clear BSS in EFI handover protocol entrypoint
  x86/decompressor: Avoid magic offsets for EFI handover entrypoint
  x86/efistub: Simplify and clean up handover entry code
  ...
Linus Torvalds 2023-08-28 15:15:37 -07:00
commit bd9e99f790
24 changed files with 589 additions and 564 deletions

View File

@@ -1417,7 +1417,7 @@ execution context provided by the EFI firmware.
The function prototype for the handover entry point looks like this::
efi_main(void *handle, efi_system_table_t *table, struct boot_params *bp)
efi_stub_entry(void *handle, efi_system_table_t *table, struct boot_params *bp)
'handle' is the EFI image handle passed to the boot loader by the EFI
firmware, 'table' is the EFI system table - these are the first two

View File

@@ -74,6 +74,11 @@ LDFLAGS_vmlinux += -z noexecstack
ifeq ($(CONFIG_LD_IS_BFD),y)
LDFLAGS_vmlinux += $(call ld-option,--no-warn-rwx-segments)
endif
ifeq ($(CONFIG_EFI_STUB),y)
# ensure that the static EFI stub library will be pulled in, even if it is
# never referenced explicitly from the startup code
LDFLAGS_vmlinux += -u efi_pe_entry
endif
LDFLAGS_vmlinux += -T
hostprogs := mkpiggy

View File

@@ -26,8 +26,8 @@
* When booting in 64-bit mode on 32-bit EFI firmware, startup_64_mixed_mode()
* is the first thing that runs after switching to long mode. Depending on
* whether the EFI handover protocol or the compat entry point was used to
* enter the kernel, it will either branch to the 64-bit EFI handover
* entrypoint at offset 0x390 in the image, or to the 64-bit EFI PE/COFF
* enter the kernel, it will either branch to the common 64-bit EFI stub
* entrypoint efi_stub_entry() directly, or via the 64-bit EFI PE/COFF
* entrypoint efi_pe_entry(). In the former case, the bootloader must provide a
* struct bootparams pointer as the third argument, so the presence of such a
* pointer is used to disambiguate.
@@ -37,21 +37,23 @@
* | efi32_pe_entry |---->| | | +-----------+--+
* +------------------+ | | +------+----------------+ |
* | startup_32 |---->| startup_64_mixed_mode | |
* +------------------+ | | +------+----------------+ V
* | efi32_stub_entry |---->| | | +------------------+
* +------------------+ +------------+ +---->| efi64_stub_entry |
* +-------------+----+
* +------------+ +----------+ |
* | startup_64 |<----| efi_main |<--------------+
* +------------+ +----------+
* +------------------+ | | +------+----------------+ |
* | efi32_stub_entry |---->| | | |
* +------------------+ +------------+ | |
* V |
* +------------+ +----------------+ |
* | startup_64 |<----| efi_stub_entry |<--------+
* +------------+ +----------------+
*/
SYM_FUNC_START(startup_64_mixed_mode)
lea efi32_boot_args(%rip), %rdx
mov 0(%rdx), %edi
mov 4(%rdx), %esi
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
mov 8(%rdx), %edx // saved bootparams pointer
test %edx, %edx
jnz efi64_stub_entry
jnz efi_stub_entry
#endif
/*
* efi_pe_entry uses MS calling convention, which requires 32 bytes of
* shadow space on the stack even if all arguments are passed in
@@ -138,6 +140,28 @@ SYM_FUNC_START(__efi64_thunk)
SYM_FUNC_END(__efi64_thunk)
.code32
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
SYM_FUNC_START(efi32_stub_entry)
call 1f
1: popl %ecx
/* Clear BSS */
xorl %eax, %eax
leal (_bss - 1b)(%ecx), %edi
leal (_ebss - 1b)(%ecx), %ecx
subl %edi, %ecx
shrl $2, %ecx
cld
rep stosl
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
/*
* EFI service pointer must be in %edi.
*
@@ -218,7 +242,7 @@ SYM_FUNC_END(efi_enter32)
* stub may still exit and return to the firmware using the Exit() EFI boot
* service.]
*/
SYM_FUNC_START(efi32_entry)
SYM_FUNC_START_LOCAL(efi32_entry)
call 1f
1: pop %ebx
@@ -245,10 +269,6 @@ SYM_FUNC_START(efi32_entry)
jmp startup_32
SYM_FUNC_END(efi32_entry)
#define ST32_boottime 60 // offsetof(efi_system_table_32_t, boottime)
#define BS32_handle_protocol 88 // offsetof(efi_boot_services_32_t, handle_protocol)
#define LI32_image_base 32 // offsetof(efi_loaded_image_32_t, image_base)
/*
* efi_status_t efi32_pe_entry(efi_handle_t image_handle,
* efi_system_table_32_t *sys_table)
@@ -256,8 +276,6 @@ SYM_FUNC_END(efi32_entry)
SYM_FUNC_START(efi32_pe_entry)
pushl %ebp
movl %esp, %ebp
pushl %eax // dummy push to allocate loaded_image
pushl %ebx // save callee-save registers
pushl %edi
@@ -266,48 +284,8 @@ SYM_FUNC_START(efi32_pe_entry)
movl $0x80000003, %eax // EFI_UNSUPPORTED
jnz 2f
call 1f
1: pop %ebx
/* Get the loaded image protocol pointer from the image handle */
leal -4(%ebp), %eax
pushl %eax // &loaded_image
leal (loaded_image_proto - 1b)(%ebx), %eax
pushl %eax // pass the GUID address
pushl 8(%ebp) // pass the image handle
/*
* Note the alignment of the stack frame.
* sys_table
* handle <-- 16-byte aligned on entry by ABI
* return address
* frame pointer
* loaded_image <-- local variable
* saved %ebx <-- 16-byte aligned here
* saved %edi
* &loaded_image
* &loaded_image_proto
* handle <-- 16-byte aligned for call to handle_protocol
*/
movl 12(%ebp), %eax // sys_table
movl ST32_boottime(%eax), %eax // sys_table->boottime
call *BS32_handle_protocol(%eax) // sys_table->boottime->handle_protocol
addl $12, %esp // restore argument space
testl %eax, %eax
jnz 2f
movl 8(%ebp), %ecx // image_handle
movl 12(%ebp), %edx // sys_table
movl -4(%ebp), %esi // loaded_image
movl LI32_image_base(%esi), %esi // loaded_image->image_base
leal (startup_32 - 1b)(%ebx), %ebp // runtime address of startup_32
/*
* We need to set the image_offset variable here since startup_32() will
* use it before we get to the 64-bit efi_pe_entry() in C code.
*/
subl %esi, %ebp // calculate image_offset
movl %ebp, (image_offset - 1b)(%ebx) // save image_offset
xorl %esi, %esi
jmp efi32_entry // pass %ecx, %edx, %esi
// no other registers remain live
@@ -318,14 +296,13 @@ SYM_FUNC_START(efi32_pe_entry)
RET
SYM_FUNC_END(efi32_pe_entry)
.section ".rodata"
/* EFI loaded image protocol GUID */
.balign 4
SYM_DATA_START_LOCAL(loaded_image_proto)
.long 0x5b1b31a1
.word 0x9562, 0x11d2
.byte 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b
SYM_DATA_END(loaded_image_proto)
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
.org efi32_stub_entry + 0x200
.code64
SYM_FUNC_START_NOALIGN(efi64_stub_entry)
jmp efi_handover_entry
SYM_FUNC_END(efi64_stub_entry)
#endif
.data
.balign 8
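
For readability, the dispatch performed by startup_64_mixed_mode() above can
be rendered as C-like pseudocode (the real routine exists only in assembly;
the array indexing stands in for the efi32_boot_args loads):

    /* pseudocode only -- mirrors the asm at the top of this file */
    void startup_64_mixed_mode(void)
    {
            u32 handle = efi32_boot_args[0];
            u32 table  = efi32_boot_args[1];
    #ifdef CONFIG_EFI_HANDOVER_PROTOCOL
            u32 bp     = efi32_boot_args[2]; /* saved by efi32_stub_entry */

            if (bp) {       /* handover protocol: boot_params provided */
                    efi_stub_entry(handle, table, bp);
                    return;
            }
    #endif
            efi_pe_entry(handle, table);  /* PE/COFF entry, no boot_params */
    }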

View File

@@ -84,19 +84,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
leal startup_32@GOTOFF(%edx), %ebx
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32() will be at an
* offset to the start of the space allocated for the image. efi_pe_entry() will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
subl image_offset@GOTOFF(%edx), %ebx
#endif
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -150,17 +137,6 @@ SYM_FUNC_START(startup_32)
jmp *%eax
SYM_FUNC_END(startup_32)
#ifdef CONFIG_EFI_STUB
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp
movl 8(%esp), %esi /* save boot_params pointer */
call efi_main
/* efi_main returns the possibly relocated address of startup_32 */
jmp *%eax
SYM_FUNC_END(efi32_stub_entry)
SYM_FUNC_ALIAS(efi_stub_entry, efi32_stub_entry)
#endif
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -179,13 +155,7 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
*/
/* push arguments for extract_kernel: */
pushl output_len@GOTOFF(%ebx) /* decompressed length, end of relocs */
pushl %ebp /* output address */
pushl input_len@GOTOFF(%ebx) /* input_len */
leal input_data@GOTOFF(%ebx), %eax
pushl %eax /* input_data */
leal boot_heap@GOTOFF(%ebx), %eax
pushl %eax /* heap area */
pushl %esi /* real mode pointer */
call extract_kernel /* returns kernel entry point in %eax */
addl $24, %esp
@@ -213,8 +183,6 @@ SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
*/
.bss
.balign 4
boot_heap:
.fill BOOT_HEAP_SIZE, 1, 0
boot_stack:
.fill BOOT_STACK_SIZE, 1, 0
boot_stack_end:

View File

@@ -146,19 +146,6 @@ SYM_FUNC_START(startup_32)
#ifdef CONFIG_RELOCATABLE
movl %ebp, %ebx
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32 will be at an
* offset to the start of the space allocated for the image. efi_pe_entry will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
subl rva(image_offset)(%ebp), %ebx
#endif
movl BP_kernel_alignment(%esi), %eax
decl %eax
addl %eax, %ebx
@@ -294,17 +281,6 @@ SYM_FUNC_START(startup_32)
lret
SYM_FUNC_END(startup_32)
#if IS_ENABLED(CONFIG_EFI_MIXED) && IS_ENABLED(CONFIG_EFI_HANDOVER_PROTOCOL)
.org 0x190
SYM_FUNC_START(efi32_stub_entry)
add $0x4, %esp /* Discard return address */
popl %ecx
popl %edx
popl %esi
jmp efi32_entry
SYM_FUNC_END(efi32_stub_entry)
#endif
.code64
.org 0x200
SYM_CODE_START(startup_64)
@@ -346,20 +322,6 @@ SYM_CODE_START(startup_64)
/* Start with the delta to where the kernel will run at. */
#ifdef CONFIG_RELOCATABLE
leaq startup_32(%rip) /* - $startup_32 */, %rbp
#ifdef CONFIG_EFI_STUB
/*
* If we were loaded via the EFI LoadImage service, startup_32 will be at an
* offset to the start of the space allocated for the image. efi_pe_entry will
* set up image_offset to tell us where the image actually starts, so that we
* can use the full available buffer.
* image_offset = startup_32 - image_base
* Otherwise image_offset will be zero and has no effect on the calculations.
*/
movl image_offset(%rip), %eax
subq %rax, %rbp
#endif
movl BP_kernel_alignment(%rsi), %eax
decl %eax
addq %rax, %rbp
@@ -398,10 +360,6 @@ SYM_CODE_START(startup_64)
* For the trampoline, we need the top page table to reside in lower
* memory as we don't have a way to load 64-bit values into CR3 in
* 32-bit mode.
*
* We go though the trampoline even if we don't have to: if we're
* already in a desired paging mode. This way the trampoline code gets
* tested on every boot.
*/
/* Make sure we have GDT with 32-bit code segment */
@@ -416,10 +374,14 @@ SYM_CODE_START(startup_64)
lretq
.Lon_kernel_cs:
/*
* RSI holds a pointer to a boot_params structure provided by the
* loader, and this needs to be preserved across C function calls. So
* move it into a callee saved register.
*/
movq %rsi, %r15
pushq %rsi
call load_stage1_idt
popq %rsi
#ifdef CONFIG_AMD_MEM_ENCRYPT
/*
@@ -430,63 +392,24 @@ SYM_CODE_START(startup_64)
* CPUID instructions being issued, so go ahead and do that now via
* sev_enable(), which will also handle the rest of the SEV-related
* detection/setup to ensure that has been done in advance of any dependent
* code.
* code. Pass the boot_params pointer as the first argument.
*/
pushq %rsi
movq %rsi, %rdi /* real mode address */
movq %r15, %rdi
call sev_enable
popq %rsi
#endif
/*
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
* configure_5level_paging() updates the number of paging levels using
* a trampoline in 32-bit addressable memory if the current number does
* not match the desired number.
*
* paging_prepare() returns a two-quadword structure which lands
* into RDX:RAX:
* - Address of the trampoline is returned in RAX.
* - Non zero RDX means trampoline needs to enable 5-level
* paging.
*
* RSI holds real mode data and needs to be preserved across
* this function call.
* Pass the boot_params pointer as the first argument. The second
* argument is the relocated address of the page table to use instead
* of the page table in trampoline memory (if required).
*/
pushq %rsi
movq %rsi, %rdi /* real mode address */
call paging_prepare
popq %rsi
/* Save the trampoline address in RCX */
movq %rax, %rcx
/*
* Load the address of trampoline_return() into RDI.
* It will be used by the trampoline to return to the main code.
*/
leaq trampoline_return(%rip), %rdi
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax
pushq %rax
lretq
trampoline_return:
/* Restore the stack, the 32-bit trampoline uses its own stack */
leaq rva(boot_stack_end)(%rbx), %rsp
/*
* cleanup_trampoline() would restore trampoline memory.
*
* RDI is address of the page table to use instead of page table
* in trampoline memory (if required).
*
* RSI holds real mode data and needs to be preserved across
* this function call.
*/
pushq %rsi
leaq rva(top_pgtable)(%rbx), %rdi
call cleanup_trampoline
popq %rsi
movq %r15, %rdi
leaq rva(top_pgtable)(%rbx), %rsi
call configure_5level_paging
/* Zero EFLAGS */
pushq $0
@@ -496,7 +419,6 @@ trampoline_return:
* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
pushq %rsi
leaq (_bss-8)(%rip), %rsi
leaq rva(_bss-8)(%rbx), %rdi
movl $(_bss - startup_32), %ecx
@ -504,7 +426,6 @@ trampoline_return:
std
rep movsq
cld
popq %rsi
/*
* The GDT may get overwritten either during the copy we just did or
@@ -523,21 +444,6 @@ trampoline_return:
jmp *%rax
SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
.org 0x390
#endif
SYM_FUNC_START(efi64_stub_entry)
and $~0xf, %rsp /* realign the stack */
movq %rdx, %rbx /* save boot_params pointer */
call efi_main
movq %rbx,%rsi
leaq rva(startup_64)(%rax), %rax
jmp *%rax
SYM_FUNC_END(efi64_stub_entry)
SYM_FUNC_ALIAS(efi_stub_entry, efi64_stub_entry)
#endif
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
@@ -551,128 +457,122 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
shrq $3, %rcx
rep stosq
pushq %rsi
call load_stage2_idt
/* Pass boot_params to initialize_identity_maps() */
movq (%rsp), %rdi
movq %r15, %rdi
call initialize_identity_maps
popq %rsi
/*
* Do the extraction, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
movq %rsi, %rdi /* real mode address */
leaq boot_heap(%rip), %rsi /* malloc area for uncompression */
leaq input_data(%rip), %rdx /* input_data */
movl input_len(%rip), %ecx /* input_len */
movq %rbp, %r8 /* output target address */
movl output_len(%rip), %r9d /* decompressed length, end of relocs */
/* pass struct boot_params pointer and output target address */
movq %r15, %rdi
movq %rbp, %rsi
call extract_kernel /* returns kernel entry point in %rax */
popq %rsi
/*
* Jump to the decompressed kernel.
*/
movq %r15, %rsi
jmp *%rax
SYM_FUNC_END(.Lrelocated)
.code32
/*
* This is the 32-bit trampoline that will be copied over to low memory.
* This is the 32-bit trampoline that will be copied over to low memory. It
* will be called using the ordinary 64-bit calling convention from code
* running in 64-bit mode.
*
* RDI contains the return address (might be above 4G).
* ECX contains the base address of the trampoline memory.
* Non zero RDX means trampoline needs to enable 5-level paging.
* Return address is at the top of the stack (might be above 4G).
* The first argument (EDI) contains the address of the temporary PGD level
* page table in 32-bit addressable memory which will be programmed into
* register CR3.
*/
.section ".rodata", "a", @progbits
SYM_CODE_START(trampoline_32bit_src)
/* Set up data and stack segments */
movl $__KERNEL_DS, %eax
movl %eax, %ds
movl %eax, %ss
/*
* Preserve callee save 64-bit registers on the stack: this is
* necessary because the architecture does not guarantee that GPRs will
* retain their full 64-bit values across a 32-bit mode switch.
*/
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbp
pushq %rbx
/* Set up new stack */
leal TRAMPOLINE_32BIT_STACK_END(%ecx), %esp
/* Preserve top half of RSP in a legacy mode GPR to avoid truncation */
movq %rsp, %rbx
shrq $32, %rbx
/* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
pushq $__KERNEL32_CS
leaq 0f(%rip), %rax
pushq %rax
lretq
/*
* The 32-bit code below will do a far jump back to long mode and end
* up here after reconfiguring the number of paging levels. First, the
* stack pointer needs to be restored to its full 64-bit value before
* the callee save register contents can be popped from the stack.
*/
.Lret:
shlq $32, %rbx
orq %rbx, %rsp
/* Restore the preserved 64-bit registers */
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
retq
.code32
0:
/* Disable paging */
movl %cr0, %eax
btrl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
/* Check what paging mode we want to be in after the trampoline */
testl %edx, %edx
jz 1f
/* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jnz 3f
jmp 2f
1:
/* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */
movl %cr4, %eax
testl $X86_CR4_LA57, %eax
jz 3f
2:
/* Point CR3 to the trampoline's new top level page table */
leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax
movl %eax, %cr3
3:
movl %edi, %cr3
/* Set EFER.LME=1 as a precaution in case hypervisor pulls the rug */
pushl %ecx
pushl %edx
movl $MSR_EFER, %ecx
rdmsr
btsl $_EFER_LME, %eax
/* Avoid writing EFER if no change was made (for TDX guest) */
jc 1f
wrmsr
1: popl %edx
popl %ecx
#ifdef CONFIG_X86_MCE
/*
* Preserve CR4.MCE if the kernel will enable #MC support.
* Clearing MCE may fault in some environments (that also force #MC
* support). Any machine check that occurs before #MC support is fully
* configured will crash the system regardless of the CR4.MCE value set
* here.
*/
movl %cr4, %eax
andl $X86_CR4_MCE, %eax
#else
movl $0, %eax
#endif
/* Enable PAE and LA57 (if required) paging modes */
orl $X86_CR4_PAE, %eax
testl %edx, %edx
jz 1f
orl $X86_CR4_LA57, %eax
1:
/* Toggle CR4.LA57 */
movl %cr4, %eax
btcl $X86_CR4_LA57_BIT, %eax
movl %eax, %cr4
/* Calculate address of paging_enabled() once we are executing in the trampoline */
leal .Lpaging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax
/* Prepare the stack for far return to Long Mode */
pushl $__KERNEL_CS
pushl %eax
/* Enable paging again. */
movl %cr0, %eax
btsl $X86_CR0_PG_BIT, %eax
movl %eax, %cr0
lret
/*
* Return to the 64-bit calling code using LJMP rather than LRET, to
* avoid the need for a 32-bit addressable stack. The destination
* address will be adjusted after the template code is copied into a
* 32-bit addressable buffer.
*/
.Ljmp: ljmpl $__KERNEL_CS, $(.Lret - trampoline_32bit_src)
SYM_CODE_END(trampoline_32bit_src)
.code64
SYM_FUNC_START_LOCAL_NOALIGN(.Lpaging_enabled)
/* Return from the trampoline */
jmp *%rdi
SYM_FUNC_END(.Lpaging_enabled)
/*
* This symbol is placed right after trampoline_32bit_src() so its address can
* be used to infer the size of the trampoline code.
*/
SYM_DATA(trampoline_ljmp_imm_offset, .word .Ljmp + 1 - trampoline_32bit_src)
/*
* The trampoline code has a size limit.
@@ -681,7 +581,7 @@ SYM_FUNC_END(.Lpaging_enabled)
*/
.org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE
.code32
.text
SYM_FUNC_START_LOCAL_NOALIGN(.Lno_longmode)
/* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */
1:
@@ -726,8 +626,6 @@ SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
*/
.bss
.balign 4
SYM_DATA_LOCAL(boot_heap, .fill BOOT_HEAP_SIZE, 1, 0)
SYM_DATA_START_LOCAL(boot_stack)
.fill BOOT_STACK_SIZE, 1, 0
.balign 16

View File

@@ -330,6 +330,33 @@ static size_t parse_elf(void *output)
return ehdr.e_entry - LOAD_PHYSICAL_ADDR;
}
const unsigned long kernel_total_size = VO__end - VO__text;
static u8 boot_heap[BOOT_HEAP_SIZE] __aligned(4);
extern unsigned char input_data[];
extern unsigned int input_len, output_len;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
void (*error)(char *x))
{
unsigned long entry;
if (!free_mem_ptr) {
free_mem_ptr = (unsigned long)boot_heap;
free_mem_end_ptr = (unsigned long)boot_heap + sizeof(boot_heap);
}
if (__decompress(input_data, input_len, NULL, NULL, outbuf, output_len,
NULL, error) < 0)
return ULONG_MAX;
entry = parse_elf(outbuf);
handle_relocations(outbuf, output_len, virt_addr);
return entry;
}
/*
* The compressed kernel image (ZO), has been moved so that its position
* is against the end of the buffer used to hold the uncompressed kernel
@@ -347,14 +374,10 @@ static size_t parse_elf(void *output)
* |-------uncompressed kernel image---------|
*
*/
asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
unsigned char *input_data,
unsigned long input_len,
unsigned char *output,
unsigned long output_len)
asmlinkage __visible void *extract_kernel(void *rmode, unsigned char *output)
{
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
memptr heap = (memptr)boot_heap;
unsigned long needed_size;
size_t entry_offset;
@@ -412,7 +435,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
* entries. This ensures the full mapped area is usable RAM
* and doesn't include any reserved areas.
*/
needed_size = max(output_len, kernel_total_size);
needed_size = max_t(unsigned long, output_len, kernel_total_size);
#ifdef CONFIG_X86_64
needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
#endif
@@ -443,7 +466,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
#ifdef CONFIG_X86_64
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
if (virt_addr + max(output_len, kernel_total_size) > KERNEL_IMAGE_SIZE)
if (virt_addr + needed_size > KERNEL_IMAGE_SIZE)
error("Destination virtual address is beyond the kernel mapping area");
#else
if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
@@ -461,10 +484,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
accept_memory(__pa(output), __pa(output) + needed_size);
}
__decompress(input_data, input_len, NULL, NULL, output, output_len,
NULL, error);
entry_offset = parse_elf(output);
handle_relocations(output, output_len, virt_addr);
entry_offset = decompress_kernel(output, virt_addr, error);
debug_putstr("done.\nBooting the kernel (entry_offset: 0x");
debug_puthex(entry_offset);

View File

@@ -179,9 +179,7 @@ static inline int count_immovable_mem_regions(void) { return 0; }
#endif
/* ident_map_64.c */
#ifdef CONFIG_X86_5LEVEL
extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
#endif
extern void kernel_add_identity_map(unsigned long start, unsigned long end);
/* Used by PAGE_KERN* macros: */

View File

@@ -3,18 +3,16 @@
#define TRAMPOLINE_32BIT_SIZE (2 * PAGE_SIZE)
#define TRAMPOLINE_32BIT_PGTABLE_OFFSET 0
#define TRAMPOLINE_32BIT_CODE_OFFSET PAGE_SIZE
#define TRAMPOLINE_32BIT_CODE_SIZE 0x80
#define TRAMPOLINE_32BIT_STACK_END TRAMPOLINE_32BIT_SIZE
#define TRAMPOLINE_32BIT_CODE_SIZE 0xA0
#ifndef __ASSEMBLER__
extern unsigned long *trampoline_32bit;
extern void trampoline_32bit_src(void *return_ptr);
extern void trampoline_32bit_src(void *trampoline, bool enable_5lvl);
extern const u16 trampoline_ljmp_imm_offset;
#endif /* __ASSEMBLER__ */
#endif /* BOOT_COMPRESSED_PAGETABLE_H */

View File

@@ -16,11 +16,6 @@ unsigned int __section(".data") pgdir_shift = 39;
unsigned int __section(".data") ptrs_per_p4d = 1;
#endif
struct paging_config {
unsigned long trampoline_start;
unsigned long l5_required;
};
/* Buffer to preserve trampoline memory */
static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
@@ -29,7 +24,7 @@ static char trampoline_save[TRAMPOLINE_32BIT_SIZE];
* purposes.
*
* Avoid putting the pointer into .bss as it will be cleared between
* paging_prepare() and extract_kernel().
* configure_5level_paging() and extract_kernel().
*/
unsigned long *trampoline_32bit __section(".data");
@@ -106,12 +101,13 @@ static unsigned long find_trampoline_placement(void)
return bios_start - TRAMPOLINE_32BIT_SIZE;
}
struct paging_config paging_prepare(void *rmode)
asmlinkage void configure_5level_paging(struct boot_params *bp, void *pgtable)
{
struct paging_config paging_config = {};
void (*toggle_la57)(void *cr3);
bool l5_required = false;
/* Initialize boot_params. Required for cmdline_find_option_bool(). */
boot_params = rmode;
boot_params = bp;
/*
* Check if LA57 is desired and supported.
@@ -129,12 +125,22 @@ struct paging_config paging_prepare(void *rmode)
!cmdline_find_option_bool("no5lvl") &&
native_cpuid_eax(0) >= 7 &&
(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31)))) {
paging_config.l5_required = 1;
l5_required = true;
/* Initialize variables for 5-level paging */
__pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
}
paging_config.trampoline_start = find_trampoline_placement();
/*
* The trampoline will not be used if the paging mode is already set to
* the desired one.
*/
if (l5_required == !!(native_read_cr4() & X86_CR4_LA57))
return;
trampoline_32bit = (unsigned long *)paging_config.trampoline_start;
trampoline_32bit = (unsigned long *)find_trampoline_placement();
/* Preserve trampoline memory */
memcpy(trampoline_save, trampoline_32bit, TRAMPOLINE_32BIT_SIZE);
@@ -143,32 +149,32 @@ struct paging_config paging_prepare(void *rmode)
memset(trampoline_32bit, 0, TRAMPOLINE_32BIT_SIZE);
/* Copy trampoline code in place */
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
toggle_la57 = memcpy(trampoline_32bit +
TRAMPOLINE_32BIT_CODE_OFFSET / sizeof(unsigned long),
&trampoline_32bit_src, TRAMPOLINE_32BIT_CODE_SIZE);
/*
* Avoid the need for a stack in the 32-bit trampoline code, by using
* LJMP rather than LRET to return back to long mode. LJMP takes an
* immediate absolute address, which needs to be adjusted based on the
* placement of the trampoline.
*/
*(u32 *)((u8 *)toggle_la57 + trampoline_ljmp_imm_offset) +=
(unsigned long)toggle_la57;
/*
* The code below prepares page table in trampoline memory.
*
* The new page table will be used by trampoline code for switching
* from 4- to 5-level paging or vice versa.
*
* If switching is not required, the page table is unused: trampoline
* code wouldn't touch CR3.
*/
/*
* We are not going to use the page table in trampoline memory if we
* are already in the desired paging mode.
*/
if (paging_config.l5_required == !!(native_read_cr4() & X86_CR4_LA57))
goto out;
if (paging_config.l5_required) {
if (l5_required) {
/*
* For 4- to 5-level paging transition, set up current CR3 as
* the first and the only entry in a new top-level page table.
*/
trampoline_32bit[TRAMPOLINE_32BIT_PGTABLE_OFFSET] = __native_read_cr3() | _PAGE_TABLE_NOENC;
*trampoline_32bit = __native_read_cr3() | _PAGE_TABLE_NOENC;
} else {
unsigned long src;
@@ -181,38 +187,17 @@ struct paging_config paging_prepare(void *rmode)
* may be above 4G.
*/
src = *(unsigned long *)__native_read_cr3() & PAGE_MASK;
memcpy(trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long),
(void *)src, PAGE_SIZE);
memcpy(trampoline_32bit, (void *)src, PAGE_SIZE);
}
out:
return paging_config;
}
void cleanup_trampoline(void *pgtable)
{
void *trampoline_pgtable;
trampoline_pgtable = trampoline_32bit + TRAMPOLINE_32BIT_PGTABLE_OFFSET / sizeof(unsigned long);
toggle_la57(trampoline_32bit);
/*
* Move the top level page table out of trampoline memory,
* if it's there.
* Move the top level page table out of trampoline memory.
*/
if ((void *)__native_read_cr3() == trampoline_pgtable) {
memcpy(pgtable, trampoline_pgtable, PAGE_SIZE);
native_write_cr3((unsigned long)pgtable);
}
memcpy(pgtable, trampoline_32bit, PAGE_SIZE);
native_write_cr3((unsigned long)pgtable);
/* Restore trampoline memory */
memcpy(trampoline_32bit, trampoline_save, TRAMPOLINE_32BIT_SIZE);
/* Initialize variables for 5-level paging */
#ifdef CONFIG_X86_5LEVEL
if (__read_cr4() & X86_CR4_LA57) {
__pgtable_l5_enabled = 1;
pgdir_shift = 48;
ptrs_per_p4d = 512;
}
#endif
}
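
Net effect for the caller: the old paging_prepare()/cleanup_trampoline()
pair, with its two-quadword RDX:RAX return and hand-rolled far return into
the trampoline, collapses into a single ordinary C call that only runs the
trampoline when the paging depth actually has to change. A minimal sketch of
the new contract, matching the head_64.S hunk above:

    /* bp: boot_params pointer (kept in %r15); top_pgtable: relocated
       page table that replaces the trampoline copy when one was used */
    configure_5level_paging(bp, top_pgtable);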

View File

@@ -367,20 +367,25 @@ static void enforce_vmpl0(void)
*/
#define SNP_FEATURES_PRESENT (0)
u64 snp_get_unsupported_features(u64 status)
{
if (!(status & MSR_AMD64_SEV_SNP_ENABLED))
return 0;
return status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
}
void snp_check_features(void)
{
u64 unsupported;
if (!(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
return;
/*
* Terminate the boot if hypervisor has enabled any feature lacking
* guest side implementation. Pass on the unsupported features mask through
* EXIT_INFO_2 of the GHCB protocol so that those features can be reported
* as part of the guest boot failure.
*/
unsupported = sev_status & SNP_FEATURES_IMPL_REQ & ~SNP_FEATURES_PRESENT;
unsupported = snp_get_unsupported_features(sev_status);
if (unsupported) {
if (ghcb_version < 2 || (!boot_ghcb && !early_setup_ghcb()))
sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED);
@@ -390,10 +395,45 @@ void snp_check_features(void)
}
}
void sev_enable(struct boot_params *bp)
/*
* sev_check_cpu_support - Check for SEV support in the CPU capabilities
*
* Returns < 0 if SEV is not supported, otherwise the position of the
* encryption bit in the page table descriptors.
*/
static int sev_check_cpu_support(void)
{
unsigned int eax, ebx, ecx, edx;
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return -ENODEV;
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
return -ENODEV;
return ebx & 0x3f;
}
void sev_enable(struct boot_params *bp)
{
struct msr m;
int bitpos;
bool snp;
/*
@@ -413,26 +453,7 @@ void sev_enable(struct boot_params *bp)
* which is good enough.
*/
/* Check for the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1)))
if (sev_check_cpu_support() < 0)
return;
/*
@@ -443,26 +464,8 @@ void sev_enable(struct boot_params *bp)
/* Now repeat the checks with the SNP CPUID table. */
/* Recheck the SME/SEV support leaf */
eax = 0x80000000;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (eax < 0x8000001f)
return;
/*
* Recheck for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
* - Bit 0 - Secure Memory Encryption support
* - Bit 1 - Secure Encrypted Virtualization support
* CPUID Fn8000_001F[EBX]
* - Bits 5:0 - Pagetable bit position used to indicate encryption
*/
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
/* Check whether SEV is supported */
if (!(eax & BIT(1))) {
bitpos = sev_check_cpu_support();
if (bitpos < 0) {
if (snp)
error("SEV-SNP support indicated by CC blob, but not CPUID.");
return;
@@ -494,7 +497,24 @@ void sev_enable(struct boot_params *bp)
if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED))
error("SEV-SNP supported indicated by CC blob, but not SEV status MSR.");
sme_me_mask = BIT_ULL(ebx & 0x3f);
sme_me_mask = BIT_ULL(bitpos);
}
/*
* sev_get_status - Retrieve the SEV status mask
*
* Returns 0 if the CPU is not SEV capable, otherwise the value of the
* AMD64_SEV MSR.
*/
u64 sev_get_status(void)
{
struct msr m;
if (sev_check_cpu_support() < 0)
return 0;
boot_rdmsr(MSR_AMD64_SEV, &m);
return m.q;
}
/* Search for Confidential Computing blob in the EFI config table. */
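
Factoring the CPUID probe out this way lets the EFI stub perform the SNP
feature test while still running inside the firmware, where an unsupported
feature can be reported cleanly instead of through a GHCB termination. A
minimal sketch of the resulting call chain, mirrored by
have_unsupported_snp_features() in the x86-stub.c hunk further below:

    u64 status = sev_get_status();  /* 0 unless CPUID reports SEV */
    u64 bad    = snp_get_unsupported_features(status);

    if (bad)        /* SNP is enabled but a required feature is missing */
            efi_exit(handle, EFI_UNSUPPORTED);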

View File

@@ -62,4 +62,12 @@
# define BOOT_STACK_SIZE 0x1000
#endif
#ifndef __ASSEMBLY__
extern unsigned int output_len;
extern const unsigned long kernel_total_size;
unsigned long decompress_kernel(unsigned char *outbuf, unsigned long virt_addr,
void (*error)(char *x));
#endif
#endif /* _ASM_X86_BOOT_H */

View File

@@ -90,6 +90,8 @@ static inline void efi_fpu_end(void)
}
#ifdef CONFIG_X86_32
#define EFI_X86_KERNEL_ALLOC_LIMIT (SZ_512M - 1)
#define arch_efi_call_virt_setup() \
({ \
efi_fpu_begin(); \
@@ -103,8 +105,7 @@ static inline void efi_fpu_end(void)
})
#else /* !CONFIG_X86_32 */
#define EFI_LOADER_SIGNATURE "EL64"
#define EFI_X86_KERNEL_ALLOC_LIMIT EFI_ALLOC_LIMIT
extern asmlinkage u64 __efi_call(void *fp, ...);
@@ -218,6 +219,8 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
#ifdef CONFIG_EFI_MIXED
#define EFI_ALLOC_LIMIT (efi_is_64bit() ? ULONG_MAX : U32_MAX)
#define ARCH_HAS_EFISTUB_WRAPPERS
static inline bool efi_is_64bit(void)

View File

@@ -164,6 +164,7 @@ static __always_inline void sev_es_nmi_complete(void)
__sev_es_nmi_complete();
}
extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
extern void sev_enable(struct boot_params *bp);
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs)
{
@@ -210,12 +211,15 @@ bool snp_init(struct boot_params *bp);
void __init __noreturn snp_abort(void);
int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, struct snp_guest_request_ioctl *rio);
void snp_accept_memory(phys_addr_t start, phys_addr_t end);
u64 snp_get_unsupported_features(u64 status);
u64 sev_get_status(void);
#else
static inline void sev_es_ist_enter(struct pt_regs *regs) { }
static inline void sev_es_ist_exit(void) { }
static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
static inline void sev_es_nmi_complete(void) { }
static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
static inline void sev_enable(struct boot_params *bp) { }
static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; }
static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; }
static inline void setup_ghcb(void) { }
@@ -235,6 +239,8 @@ static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *in
}
static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { }
static inline u64 snp_get_unsupported_features(u64 status) { return 0; }
static inline u64 sev_get_status(void) { return 0; }
#endif
#endif

View File

@@ -51,7 +51,9 @@ SYM_CODE_START_NOALIGN(startup_64)
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
* %rsi holds a physical pointer to real_mode_data.
* %RSI holds the physical address of the boot_params structure
* provided by the bootloader. Preserve it in %R15 so C function calls
* will not clobber it.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86/boot/compressed/head_64.S.
@@ -62,6 +64,7 @@ SYM_CODE_START_NOALIGN(startup_64)
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
mov %rsi, %r15
/* Set up the stack for verify_cpu() */
leaq (__end_init_task - PTREGS_SIZE)(%rip), %rsp
@@ -75,9 +78,7 @@ SYM_CODE_START_NOALIGN(startup_64)
shrq $32, %rdx
wrmsr
pushq %rsi
call startup_64_setup_env
popq %rsi
/* Now switch to __KERNEL_CS so IRET works reliably */
pushq $__KERNEL_CS
@@ -93,12 +94,10 @@ SYM_CODE_START_NOALIGN(startup_64)
* Activate SEV/SME memory encryption if supported/enabled. This needs to
* be done now, since this also includes setup of the SEV-SNP CPUID table,
* which needs to be done before any CPUID instructions are executed in
* subsequent code.
* subsequent code. Pass the boot_params pointer as the first argument.
*/
movq %rsi, %rdi
pushq %rsi
movq %r15, %rdi
call sme_enable
popq %rsi
#endif
/* Sanitize CPU configuration */
@@ -111,9 +110,8 @@ SYM_CODE_START_NOALIGN(startup_64)
* programmed into CR3.
*/
leaq _text(%rip), %rdi
pushq %rsi
movq %r15, %rsi
call __startup_64
popq %rsi
/* Form the CR3 value being sure to include the CR3 modifier */
addq $(early_top_pgt - __START_KERNEL_map), %rax
@@ -127,8 +125,6 @@ SYM_CODE_START(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
*
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
*
@@ -153,6 +149,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
/* Clear %R15 which holds the boot_params pointer on the boot CPU */
xorq %r15, %r15
/*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
@@ -199,13 +198,9 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
* hypervisor could lie about the C-bit position to perform a ROP
* attack on the guest by writing to the unencrypted stack and wait for
* the next RET instruction.
* %rsi carries pointer to realmode data and is callee-clobbered. Save
* and restore it.
*/
pushq %rsi
movq %rax, %rdi
call sev_verify_cbit
popq %rsi
/*
* Switch to new page-table
@@ -365,9 +360,7 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
wrmsr
/* Setup and Load IDT */
pushq %rsi
call early_setup_idt
popq %rsi
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -403,9 +396,8 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
pushq $0
popfq
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
/* Pass the boot_params pointer as first argument */
movq %r15, %rdi
.Ljump_to_C_code:
/*

View File

@@ -88,6 +88,7 @@ lib-$(CONFIG_EFI_GENERIC_STUB) += efi-stub.o string.o intrinsics.o systable.o \
lib-$(CONFIG_ARM) += arm32-stub.o
lib-$(CONFIG_ARM64) += arm64.o arm64-stub.o smbios.o
lib-$(CONFIG_X86) += x86-stub.o
lib-$(CONFIG_X86_64) += x86-5lvl.o
lib-$(CONFIG_RISCV) += riscv.o riscv-stub.o
lib-$(CONFIG_LOONGARCH) += loongarch.o loongarch-stub.o

View File

@@ -106,7 +106,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr,
*/
status = efi_random_alloc(*reserve_size, min_kimg_align,
reserve_addr, phys_seed,
EFI_LOADER_CODE);
EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
if (status != EFI_SUCCESS)
efi_warn("efi_random_alloc() failed: 0x%lx\n", status);
} else {

View File

@@ -73,6 +73,8 @@ efi_status_t efi_parse_options(char const *cmdline)
efi_loglevel = CONSOLE_LOGLEVEL_QUIET;
} else if (!strcmp(param, "noinitrd")) {
efi_noinitrd = true;
} else if (IS_ENABLED(CONFIG_X86_64) && !strcmp(param, "no5lvl")) {
efi_no5lvl = true;
} else if (!strcmp(param, "efi") && val) {
efi_nochunk = parse_option_str(val, "nochunk");
efi_novamap |= parse_option_str(val, "novamap");

View File

@@ -33,6 +33,7 @@
#define EFI_ALLOC_LIMIT ULONG_MAX
#endif
extern bool efi_no5lvl;
extern bool efi_nochunk;
extern bool efi_nokaslr;
extern int efi_loglevel;
@@ -955,7 +956,7 @@ efi_status_t efi_get_random_bytes(unsigned long size, u8 *out);
efi_status_t efi_random_alloc(unsigned long size, unsigned long align,
unsigned long *addr, unsigned long random_seed,
int memory_type);
int memory_type, unsigned long alloc_limit);
efi_status_t efi_random_get_seed(void);

View File

@@ -16,7 +16,8 @@
*/
static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
unsigned long size,
unsigned long align_shift)
unsigned long align_shift,
u64 alloc_limit)
{
unsigned long align = 1UL << align_shift;
u64 first_slot, last_slot, region_end;
@@ -29,7 +30,7 @@ static unsigned long get_entry_num_slots(efi_memory_desc_t *md,
return 0;
region_end = min(md->phys_addr + md->num_pages * EFI_PAGE_SIZE - 1,
(u64)EFI_ALLOC_LIMIT);
alloc_limit);
if (region_end < size)
return 0;
@@ -54,7 +55,8 @@ efi_status_t efi_random_alloc(unsigned long size,
unsigned long align,
unsigned long *addr,
unsigned long random_seed,
int memory_type)
int memory_type,
unsigned long alloc_limit)
{
unsigned long total_slots = 0, target_slot;
unsigned long total_mirrored_slots = 0;
@@ -76,7 +78,7 @@ efi_status_t efi_random_alloc(unsigned long size,
efi_memory_desc_t *md = (void *)map->map + map_offset;
unsigned long slots;
slots = get_entry_num_slots(md, size, ilog2(align));
slots = get_entry_num_slots(md, size, ilog2(align), alloc_limit);
MD_NUM_SLOTS(md) = slots;
total_slots += slots;
if (md->attribute & EFI_MEMORY_MORE_RELIABLE)

View File

@@ -0,0 +1,95 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/efi.h>
#include <asm/boot.h>
#include <asm/desc.h>
#include <asm/efi.h>
#include "efistub.h"
#include "x86-stub.h"
bool efi_no5lvl;
static void (*la57_toggle)(void *cr3);
static const struct desc_struct gdt[] = {
[GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
[GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
};
/*
* Enabling (or disabling) 5 level paging is tricky, because it can only be
* done from 32-bit mode with paging disabled. This means not only that the
* code itself must be running from 32-bit addressable physical memory, but
* also that the root page table must be 32-bit addressable, as programming
* a 64-bit value into CR3 when running in 32-bit mode is not supported.
*/
efi_status_t efi_setup_5level_paging(void)
{
u8 tmpl_size = (u8 *)&trampoline_ljmp_imm_offset - (u8 *)&trampoline_32bit_src;
efi_status_t status;
u8 *la57_code;
if (!efi_is_64bit())
return EFI_SUCCESS;
/* check for 5 level paging support */
if (native_cpuid_eax(0) < 7 ||
!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
return EFI_SUCCESS;
/* allocate some 32-bit addressable memory for code and a page table */
status = efi_allocate_pages(2 * PAGE_SIZE, (unsigned long *)&la57_code,
U32_MAX);
if (status != EFI_SUCCESS)
return status;
la57_toggle = memcpy(la57_code, trampoline_32bit_src, tmpl_size);
memset(la57_code + tmpl_size, 0x90, PAGE_SIZE - tmpl_size);
/*
* To avoid the need to allocate a 32-bit addressable stack, the
* trampoline uses a LJMP instruction to switch back to long mode.
* LJMP takes an absolute destination address, which needs to be
* fixed up at runtime.
*/
*(u32 *)&la57_code[trampoline_ljmp_imm_offset] += (unsigned long)la57_code;
efi_adjust_memory_range_protection((unsigned long)la57_toggle, PAGE_SIZE);
return EFI_SUCCESS;
}
void efi_5level_switch(void)
{
bool want_la57 = IS_ENABLED(CONFIG_X86_5LEVEL) && !efi_no5lvl;
bool have_la57 = native_read_cr4() & X86_CR4_LA57;
bool need_toggle = want_la57 ^ have_la57;
u64 *pgt = (void *)la57_toggle + PAGE_SIZE;
u64 *cr3 = (u64 *)__native_read_cr3();
u64 *new_cr3;
if (!la57_toggle || !need_toggle)
return;
if (!have_la57) {
/*
* 5 level paging will be enabled, so a root level page needs
* to be allocated from the 32-bit addressable physical region,
* with its first entry referring to the existing hierarchy.
*/
new_cr3 = memset(pgt, 0, PAGE_SIZE);
new_cr3[0] = (u64)cr3 | _PAGE_TABLE_NOENC;
} else {
/* take the new root table pointer from the current entry #0 */
new_cr3 = (u64 *)(cr3[0] & PAGE_MASK);
/* copy the new root table if it is not 32-bit addressable */
if ((u64)new_cr3 > U32_MAX)
new_cr3 = memcpy(pgt, new_cr3, PAGE_SIZE);
}
native_load_gdt(&(struct desc_ptr){ sizeof(gdt) - 1, (u64)gdt });
la57_toggle(new_cr3);
}
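
Note the split between the two entry points: efi_setup_5level_paging() must
run early, while EFI boot services can still hand out 32-bit addressable
pages for the trampoline and page table, whereas efi_5level_switch() runs at
the very end. The ordering in efi_stub_entry() (see the x86-stub.c hunk
below), sketched:

    status = efi_setup_5level_paging(); /* early: allocate trampoline */
    /* ... cmdline, initrd, efi_decompress_kernel(), exit_boot() ... */
    sev_enable(boot_params);    /* firmware GDT/IDT still handle #VC */
    efi_5level_switch();        /* late: toggle CR4.LA57 if needed */
    enter_kernel(kernel_entry, boot_params);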

View File

@@ -15,16 +15,16 @@
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/boot.h>
#include <asm/kaslr.h>
#include <asm/sev.h>
#include "efistub.h"
/* Maximum physical address for 64-bit kernel with 4-level paging */
#define MAXMEM_X86_64_4LEVEL (1ull << 46)
#include "x86-stub.h"
const efi_system_table_t *efi_system_table;
const efi_dxe_services_table_t *efi_dxe_table;
u32 image_offset __section(".data");
static efi_loaded_image_t *image = NULL;
static efi_memory_attribute_protocol_t *memattr;
typedef union sev_memory_acceptance_protocol sev_memory_acceptance_protocol_t;
union sev_memory_acceptance_protocol {
@@ -223,8 +223,8 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
}
}
static void
adjust_memory_range_protection(unsigned long start, unsigned long size)
void efi_adjust_memory_range_protection(unsigned long start,
unsigned long size)
{
efi_status_t status;
efi_gcd_memory_space_desc_t desc;
@@ -232,12 +232,18 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
unsigned long rounded_start, rounded_end;
unsigned long unprotect_start, unprotect_size;
if (efi_dxe_table == NULL)
return;
rounded_start = rounddown(start, EFI_PAGE_SIZE);
rounded_end = roundup(start + size, EFI_PAGE_SIZE);
if (memattr != NULL) {
efi_call_proto(memattr, clear_memory_attributes, rounded_start,
rounded_end - rounded_start, EFI_MEMORY_XP);
return;
}
if (efi_dxe_table == NULL)
return;
/*
* Don't modify memory region attributes, they are
* already suitable, to lower the possibility to
@@ -278,49 +284,6 @@ adjust_memory_range_protection(unsigned long start, unsigned long size)
}
}
/*
* Trampoline takes 2 pages and can be loaded in first megabyte of memory
* with its end placed between 128k and 640k where BIOS might start.
* (see arch/x86/boot/compressed/pgtable_64.c)
*
* We cannot find exact trampoline placement since memory map
* can be modified by UEFI, and it can alter the computed address.
*/
#define TRAMPOLINE_PLACEMENT_BASE ((128 - 8)*1024)
#define TRAMPOLINE_PLACEMENT_SIZE (640*1024 - (128 - 8)*1024)
void startup_32(struct boot_params *boot_params);
static void
setup_memory_protection(unsigned long image_base, unsigned long image_size)
{
/*
* Allow execution of possible trampoline used
* for switching between 4- and 5-level page tables
* and relocated kernel image.
*/
adjust_memory_range_protection(TRAMPOLINE_PLACEMENT_BASE,
TRAMPOLINE_PLACEMENT_SIZE);
#ifdef CONFIG_64BIT
if (image_base != (unsigned long)startup_32)
adjust_memory_range_protection(image_base, image_size);
#else
/*
* Clear protection flags on a whole range of possible
* addresses used for KASLR. We don't need to do that
* on x86_64, since KASLR/extraction is performed after
* dedicated identity page tables are built and we only
* need to remove possible protection on relocated image
* itself disregarding further relocations.
*/
adjust_memory_range_protection(LOAD_PHYSICAL_ADDR,
KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR);
#endif
}
static void setup_unaccepted_memory(void)
{
efi_guid_t mem_acceptance_proto = OVMF_SEV_MEMORY_ACCEPTANCE_PROTOCOL_GUID;
@@ -346,9 +309,7 @@ static void setup_unaccepted_memory(void)
static const efi_char16_t apple[] = L"Apple";
static void setup_quirks(struct boot_params *boot_params,
unsigned long image_base,
unsigned long image_size)
static void setup_quirks(struct boot_params *boot_params)
{
efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
efi_table_attr(efi_system_table, fw_vendor);
@@ -357,9 +318,6 @@ static void setup_quirks(struct boot_params *boot_params,
if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
retrieve_apple_device_properties(boot_params);
}
if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES))
setup_memory_protection(image_base, image_size);
}
/*
@@ -512,7 +470,6 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
}
image_base = efi_table_attr(image, image_base);
image_offset = (void *)startup_32 - image_base;
status = efi_allocate_pages(sizeof(struct boot_params),
(unsigned long *)&boot_params, ULONG_MAX);
@@ -803,19 +760,96 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
return EFI_SUCCESS;
}
/*
* On success, we return the address of startup_32, which has potentially been
* relocated by efi_relocate_kernel.
* On failure, we exit to the firmware via efi_exit instead of returning.
*/
asmlinkage unsigned long efi_main(efi_handle_t handle,
efi_system_table_t *sys_table_arg,
struct boot_params *boot_params)
static bool have_unsupported_snp_features(void)
{
unsigned long bzimage_addr = (unsigned long)startup_32;
unsigned long buffer_start, buffer_end;
u64 unsupported;
unsupported = snp_get_unsupported_features(sev_get_status());
if (unsupported) {
efi_err("Unsupported SEV-SNP features detected: 0x%llx\n",
unsupported);
return true;
}
return false;
}
static void efi_get_seed(void *seed, int size)
{
efi_get_random_bytes(size, seed);
/*
* This only updates seed[0] when running on 32-bit, but in that case,
* seed[1] is not used anyway, as there is no virtual KASLR on 32-bit.
*/
*(unsigned long *)seed ^= kaslr_get_random_long("EFI");
}
static void error(char *str)
{
efi_warn("Decompression failed: %s\n", str);
}
static efi_status_t efi_decompress_kernel(unsigned long *kernel_entry)
{
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
unsigned long addr, alloc_size, entry;
efi_status_t status;
u32 seed[2] = {};
/* determine the required size of the allocation */
alloc_size = ALIGN(max_t(unsigned long, output_len, kernel_total_size),
MIN_KERNEL_ALIGN);
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && !efi_nokaslr) {
u64 range = KERNEL_IMAGE_SIZE - LOAD_PHYSICAL_ADDR - kernel_total_size;
efi_get_seed(seed, sizeof(seed));
virt_addr += (range * seed[1]) >> 32;
virt_addr &= ~(CONFIG_PHYSICAL_ALIGN - 1);
}
status = efi_random_alloc(alloc_size, CONFIG_PHYSICAL_ALIGN, &addr,
seed[0], EFI_LOADER_CODE,
EFI_X86_KERNEL_ALLOC_LIMIT);
if (status != EFI_SUCCESS)
return status;
entry = decompress_kernel((void *)addr, virt_addr, error);
if (entry == ULONG_MAX) {
efi_free(alloc_size, addr);
return EFI_LOAD_ERROR;
}
*kernel_entry = addr + entry;
efi_adjust_memory_range_protection(addr, kernel_total_size);
return EFI_SUCCESS;
}
static void __noreturn enter_kernel(unsigned long kernel_addr,
struct boot_params *boot_params)
{
/* enter decompressed kernel with boot_params pointer in RSI/ESI */
asm("jmp *%0"::"r"(kernel_addr), "S"(boot_params));
unreachable();
}
/*
* On success, this routine will jump to the relocated image directly and never
* return. On failure, it will exit to the firmware via efi_exit() instead of
* returning.
*/
void __noreturn efi_stub_entry(efi_handle_t handle,
efi_system_table_t *sys_table_arg,
struct boot_params *boot_params)
{
efi_guid_t guid = EFI_MEMORY_ATTRIBUTE_PROTOCOL_GUID;
struct setup_header *hdr = &boot_params->hdr;
const struct linux_efi_initrd *initrd = NULL;
unsigned long kernel_entry;
efi_status_t status;
efi_system_table = sys_table_arg;
@@ -823,65 +857,25 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
efi_exit(handle, EFI_INVALID_PARAMETER);
efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
if (efi_dxe_table &&
efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
efi_warn("Ignoring DXE services table: invalid signature\n");
efi_dxe_table = NULL;
if (have_unsupported_snp_features())
efi_exit(handle, EFI_UNSUPPORTED);
if (IS_ENABLED(CONFIG_EFI_DXE_MEM_ATTRIBUTES)) {
efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID);
if (efi_dxe_table &&
efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) {
efi_warn("Ignoring DXE services table: invalid signature\n");
efi_dxe_table = NULL;
}
}
/*
* If the kernel isn't already loaded at a suitable address,
* relocate it.
*
* It must be loaded above LOAD_PHYSICAL_ADDR.
*
* The maximum address for 64-bit is 1 << 46 for 4-level paging. This
* is defined as the macro MAXMEM, but unfortunately that is not a
* compile-time constant if 5-level paging is configured, so we instead
* define our own macro for use here.
*
* For 32-bit, the maximum address is complicated to figure out, for
* now use KERNEL_IMAGE_SIZE, which will be 512MiB, the same as what
* KASLR uses.
*
* Also relocate it if image_offset is zero, i.e. the kernel wasn't
* loaded by LoadImage, but rather by a bootloader that called the
* handover entry. The reason we must always relocate in this case is
* to handle the case of systemd-boot booting a unified kernel image,
* which is a PE executable that contains the bzImage and an initrd as
* COFF sections. The initrd section is placed after the bzImage
* without ensuring that there are at least init_size bytes available
* for the bzImage, and thus the compressed kernel's startup code may
* overwrite the initrd unless it is moved out of the way.
*/
/* grab the memory attributes protocol if it exists */
efi_bs_call(locate_protocol, &guid, NULL, (void **)&memattr);
buffer_start = ALIGN(bzimage_addr - image_offset,
hdr->kernel_alignment);
buffer_end = buffer_start + hdr->init_size;
if ((buffer_start < LOAD_PHYSICAL_ADDR) ||
(IS_ENABLED(CONFIG_X86_32) && buffer_end > KERNEL_IMAGE_SIZE) ||
(IS_ENABLED(CONFIG_X86_64) && buffer_end > MAXMEM_X86_64_4LEVEL) ||
(image_offset == 0)) {
extern char _bss[];
status = efi_relocate_kernel(&bzimage_addr,
(unsigned long)_bss - bzimage_addr,
hdr->init_size,
hdr->pref_address,
hdr->kernel_alignment,
LOAD_PHYSICAL_ADDR);
if (status != EFI_SUCCESS) {
efi_err("efi_relocate_kernel() failed!\n");
goto fail;
}
/*
* Now that we've copied the kernel elsewhere, we no longer
* have a set up block before startup_32(), so reset image_offset
* to zero in case it was set earlier.
*/
image_offset = 0;
status = efi_setup_5level_paging();
if (status != EFI_SUCCESS) {
efi_err("efi_setup_5level_paging() failed!\n");
goto fail;
}
#ifdef CONFIG_CMDLINE_BOOL
@@ -901,6 +895,12 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
}
}
status = efi_decompress_kernel(&kernel_entry);
if (status != EFI_SUCCESS) {
efi_err("Failed to decompress kernel\n");
goto fail;
}
/*
* At this point, an initrd may already have been loaded by the
* bootloader and passed via bootparams. We permit an initrd loaded
@@ -940,7 +940,7 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
setup_efi_pci(boot_params);
setup_quirks(boot_params, bzimage_addr, buffer_end - buffer_start);
setup_quirks(boot_params);
setup_unaccepted_memory();
@@ -950,9 +950,38 @@ asmlinkage unsigned long efi_main(efi_handle_t handle,
goto fail;
}
return bzimage_addr;
/*
* Call the SEV init code while still running with the firmware's
* GDT/IDT, so #VC exceptions will be handled by EFI.
*/
sev_enable(boot_params);
efi_5level_switch();
enter_kernel(kernel_entry, boot_params);
fail:
efi_err("efi_main() failed!\n");
efi_err("efi_stub_entry() failed!\n");
efi_exit(handle, status);
}
#ifdef CONFIG_EFI_HANDOVER_PROTOCOL
void efi_handover_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
struct boot_params *boot_params)
{
extern char _bss[], _ebss[];
memset(_bss, 0, _ebss - _bss);
efi_stub_entry(handle, sys_table_arg, boot_params);
}
#ifndef CONFIG_EFI_MIXED
extern __alias(efi_handover_entry)
void efi32_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
struct boot_params *boot_params);
extern __alias(efi_handover_entry)
void efi64_stub_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg,
struct boot_params *boot_params);
#endif
#endif
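
The __alias() uses above rely on the compiler's alias attribute: one
definition is given several linker-visible names, so non-mixed-mode kernels
get efi32_stub_entry/efi64_stub_entry without any assembly thunk. A
standalone illustration of the mechanism, with hypothetical names:

    /* one body, several symbols: calls to either name land in impl() */
    void impl(void)
    {
    }

    extern __attribute__((alias("impl"))) void name_a(void);
    extern __attribute__((alias("impl"))) void name_b(void);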

View File

@@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/efi.h>
extern void trampoline_32bit_src(void *, bool);
extern const u16 trampoline_ljmp_imm_offset;
void efi_adjust_memory_range_protection(unsigned long start,
unsigned long size);
#ifdef CONFIG_X86_64
efi_status_t efi_setup_5level_paging(void);
void efi_5level_switch(void);
#else
static inline efi_status_t efi_setup_5level_paging(void) { return EFI_SUCCESS; }
static inline void efi_5level_switch(void) {}
#endif

View File

@@ -119,7 +119,7 @@ efi_zboot_entry(efi_handle_t handle, efi_system_table_t *systab)
}
status = efi_random_alloc(alloc_size, min_kimg_align, &image_base,
seed, EFI_LOADER_CODE);
seed, EFI_LOADER_CODE, EFI_ALLOC_LIMIT);
if (status != EFI_SUCCESS) {
efi_err("Failed to allocate memory\n");
goto free_cmdline;

View File

@@ -48,7 +48,7 @@ MALLOC_VISIBLE void *malloc(int size)
if (!malloc_ptr)
malloc_ptr = free_mem_ptr;
malloc_ptr = (malloc_ptr + 3) & ~3; /* Align */
malloc_ptr = (malloc_ptr + 7) & ~7; /* Align */
p = (void *)malloc_ptr;
malloc_ptr += size;
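
This is the usual power-of-two round-up idiom, widened here from 4- to
8-byte granularity (cf. "decompress: Use 8 byte alignment" in the shortlog):
add alignment-1, then mask off the low bits, so (13 + 7) & ~7 == 16 while
(16 + 7) & ~7 == 16. As a generic macro (illustrative; the kernel proper
uses ALIGN() from its headers):

    /* round x up to the next multiple of the power-of-two a */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))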