From 3f874c9b2aae8e30463efc1872bea4baa9ed25dc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 9 Aug 2023 20:52:20 +0200
Subject: [PATCH 1/4] x86/smp: Don't send INIT to non-present and non-booted CPUs

Vasant reported that kexec() can hang or reset the machine when it tries
to park CPUs via INIT. This happens when the kernel is using extended
APIC, but the present mask has APIC IDs >= 0x100 enumerated.

As the extended APIC can only handle 8 bits of APIC ID, sending INIT to
APIC ID 0x100 actually sends INIT to APIC ID 0x0. That's the boot CPU,
which is special on x86, and an INIT to it hangs or resets the machine.

Prevent this by sending INIT only to those CPUs which have been booted
once.

Fixes: 45e34c8af58f ("x86/smp: Put CPUs into INIT on shutdown if possible")
Reported-by: Dheeraj Kumar Srivastava
Signed-off-by: Thomas Gleixner
Tested-by: Vasant Hegde
Link: https://lore.kernel.org/r/87cyzwjbff.ffs@tglx
---
 arch/x86/kernel/smpboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d7667a29acf3..4e45ff44aa07 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1250,7 +1250,7 @@ bool smp_park_other_cpus_in_init(void)
 	if (this_cpu)
 		return false;
 
-	for_each_present_cpu(cpu) {
+	for_each_cpu_and(cpu, &cpus_booted_once_mask, cpu_present_mask) {
 		if (cpu == this_cpu)
 			continue;
 		apicid = apic->cpu_present_to_apicid(cpu);
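
To illustrate the failure mode fixed by the patch above: in xAPIC mode the
ICR destination field is only 8 bits wide, so any APIC ID above 0xff wraps
around when it is programmed. The following is a minimal userspace sketch of
that truncation, not kernel code; the register layout in the comment is for
illustration only.

#include <stdint.h>
#include <stdio.h>

/*
 * In xAPIC mode the ICR destination field (ICR2 bits 31:24) holds only
 * 8 bits of APIC ID, so programming APIC ID 0x100 is indistinguishable
 * from programming APIC ID 0x0 -- the boot CPU.
 */
static uint32_t xapic_icr2_dest(uint32_t apicid)
{
	return (apicid & 0xff) << 24;
}

int main(void)
{
	printf("INIT destination for APIC ID 0x100: %#x\n",
	       xapic_icr2_dest(0x100) >> 24);	/* prints 0 */
	return 0;
}
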
From 65e710899fd19f435f40268f3a92dfaa11f14470 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 6 Sep 2023 10:52:15 -0700
Subject: [PATCH 2/4] x86/build: Fix linker fill bytes quirk/incompatibility for ld.lld

With ":text =0xcccc", ld.lld fills the unused text area with 0xcccc0000.
Example objdump -D output:

	ffffffff82b04203:	00 00		add    %al,(%rax)
	ffffffff82b04205:	cc		int3
	ffffffff82b04206:	cc		int3
	ffffffff82b04207:	00 00		add    %al,(%rax)
	ffffffff82b04209:	cc		int3
	ffffffff82b0420a:	cc		int3

Replace it with ":text =0xcccccccc", so we get the following instead:

	ffffffff82b04203:	cc		int3
	ffffffff82b04204:	cc		int3
	ffffffff82b04205:	cc		int3
	ffffffff82b04206:	cc		int3
	ffffffff82b04207:	cc		int3
	ffffffff82b04208:	cc		int3

gcc/ld doesn't seem to have the same issue; the generated code stays the
same there with either fill expression.

Signed-off-by: Song Liu
Signed-off-by: Ingo Molnar
Reviewed-by: Kees Cook
Acked-by: Peter Zijlstra (Intel)
Fixes: 7705dc855797 ("x86/vmlinux: Use INT3 instead of NOP for linker fill bytes")
Link: https://lore.kernel.org/r/20230906175215.2236033-1-song@kernel.org
---
 arch/x86/kernel/vmlinux.lds.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 83d41c2601d7..f15fb71f280e 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -156,7 +156,7 @@ SECTIONS
 		ALIGN_ENTRY_TEXT_END
 		*(.gnu.warning)
 
-	} :text =0xcccc
+	} :text = 0xcccccccc
 
 	/* End of text section, which should occupy whole number of pages */
 	_etext = .;

From 659df86a7b2fe98feb5f4ec880e694caaebd27ae Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Wed, 6 Sep 2023 18:26:58 +0200
Subject: [PATCH 3/4] x86: Remove the arch_calc_vm_prot_bits() macro from the UAPI

The arch_calc_vm_prot_bits() macro uses VM_PKEY_BIT0 etc., which are not
part of the UAPI, so the macro is completely useless for userspace. It is
also hidden behind the CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS config
switch, which we shouldn't expose to userspace. Thus let's move this macro
into a new internal header instead.

Fixes: 8f62c883222c ("x86/mm/pkeys: Add arch-specific VMA protection bits")
Signed-off-by: Thomas Huth
Signed-off-by: Ingo Molnar
Reviewed-by: Arnd Bergmann
Reviewed-by: Nicolas Schier
Acked-by: Dave Hansen
Link: https://lore.kernel.org/r/20230906162658.142511-1-thuth@redhat.com
---
 arch/x86/include/asm/mman.h      | 15 +++++++++++++++
 arch/x86/include/uapi/asm/mman.h |  8 --------
 scripts/headers_install.sh       |  1 -
 3 files changed, 15 insertions(+), 9 deletions(-)
 create mode 100644 arch/x86/include/asm/mman.h

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 000000000000..12b820259b9f
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_MMAN_H__
+#define __ASM_MMAN_H__
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define arch_calc_vm_prot_bits(prot, key) (		\
+		((key) & 0x1 ? VM_PKEY_BIT0 : 0) |	\
+		((key) & 0x2 ? VM_PKEY_BIT1 : 0) |	\
+		((key) & 0x4 ? VM_PKEY_BIT2 : 0) |	\
+		((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#include <uapi/asm/mman.h>
+
+#endif /* __ASM_MMAN_H__ */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 8148bdddbd2c..46cdc941f958 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -5,14 +5,6 @@
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 #define MAP_ABOVE4G	0x80		/* only map above 4GB */
 
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-#define arch_calc_vm_prot_bits(prot, key) (		\
-		((key) & 0x1 ? VM_PKEY_BIT0 : 0) |	\
-		((key) & 0x2 ? VM_PKEY_BIT1 : 0) |	\
-		((key) & 0x4 ? VM_PKEY_BIT2 : 0) |	\
-		((key) & 0x8 ? VM_PKEY_BIT3 : 0))
-#endif
-
 /* Flags for map_shadow_stack(2) */
 #define SHADOW_STACK_SET_TOKEN	(1ULL << 0)	/* Set up a restore token in the shadow stack */
 
diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh
index afdddc82f02b..56d3c338d91d 100755
--- a/scripts/headers_install.sh
+++ b/scripts/headers_install.sh
@@ -81,7 +81,6 @@ arch/nios2/include/uapi/asm/swab.h:CONFIG_NIOS2_CI_SWAB_NO
 arch/nios2/include/uapi/asm/swab.h:CONFIG_NIOS2_CI_SWAB_SUPPORT
 arch/x86/include/uapi/asm/auxvec.h:CONFIG_IA32_EMULATION
 arch/x86/include/uapi/asm/auxvec.h:CONFIG_X86_64
-arch/x86/include/uapi/asm/mman.h:CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 "
 
 for c in $configs
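
As a side note on the macro being moved by the patch above: it simply spreads
the 4-bit protection key across the VM_PKEY_BIT* vm_flags bits. Below is a
minimal userspace-style sketch of that mapping; the VM_PKEY_BIT* values used
here are placeholders, not the kernel's actual vm_flags bit positions, which
is exactly why the macro cannot live in a UAPI header.

#include <stdio.h>

/* Placeholder values -- the real VM_PKEY_BIT* flags are kernel-internal. */
#define VM_PKEY_BIT0 (1UL << 0)
#define VM_PKEY_BIT1 (1UL << 1)
#define VM_PKEY_BIT2 (1UL << 2)
#define VM_PKEY_BIT3 (1UL << 3)

/* Mirrors the bit selection done by arch_calc_vm_prot_bits(). */
static unsigned long pkey_to_vm_bits(unsigned int key)
{
	return ((key & 0x1) ? VM_PKEY_BIT0 : 0) |
	       ((key & 0x2) ? VM_PKEY_BIT1 : 0) |
	       ((key & 0x4) ? VM_PKEY_BIT2 : 0) |
	       ((key & 0x8) ? VM_PKEY_BIT3 : 0);
}

int main(void)
{
	/* pkey 5 (binary 0101) selects VM_PKEY_BIT0 | VM_PKEY_BIT2. */
	printf("pkey 5 -> %#lx\n", pkey_to_vm_bits(5));
	return 0;
}
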
From 3d7d72a34e05b23e21bafc8bfb861e73c86b31f3 Mon Sep 17 00:00:00 2001
From: Jack Wang
Date: Wed, 6 Sep 2023 15:17:12 +0200
Subject: [PATCH 4/4] x86/sgx: Break up long non-preemptible delays in sgx_vepc_release()

On large enclaves we hit the softlockup warning with the following call
trace:

	xa_erase()
	sgx_vepc_release()
	__fput()
	task_work_run()
	do_exit()

The latency issue is similar to the one fixed in:

  8795359e35bc ("x86/sgx: Silence softlockup detection when releasing large enclaves")

The test system has 64GB of enclave memory, and all of it is assigned to a
single VM. Release of 'vepc' takes a long time and causes long latencies,
which triggers the softlockup warning.

Add cond_resched() to give other tasks a chance to run and reduce
latencies, which also avoids the softlockup detector.

[ mingo: Rewrote the changelog. ]

Fixes: 540745ddbc70 ("x86/sgx: Introduce virtual EPC for use by KVM guests")
Reported-by: Yu Zhang
Signed-off-by: Jack Wang
Signed-off-by: Ingo Molnar
Tested-by: Yu Zhang
Reviewed-by: Jarkko Sakkinen
Reviewed-by: Kai Huang
Acked-by: Haitao Huang
Cc: stable@vger.kernel.org
---
 arch/x86/kernel/cpu/sgx/virt.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index c3e37eaec8ec..7aaa3652e31d 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -204,6 +204,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 			continue;
 
 		xa_erase(&vepc->page_array, index);
+		cond_resched();
 	}
 
 	/*
@@ -222,6 +223,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 			list_add_tail(&epc_page->list, &secs_pages);
 
 		xa_erase(&vepc->page_array, index);
+		cond_resched();
 	}
 
 	/*
@@ -243,6 +245,7 @@ static int sgx_vepc_release(struct inode *inode, struct file *file)
 		if (sgx_vepc_free_page(epc_page))
 			list_add_tail(&epc_page->list, &secs_pages);
 
+		cond_resched();
 	}
 
 	if (!list_empty(&secs_pages))
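
The general pattern applied by this last patch: a teardown loop that can
iterate over a very large number of entries in process context gets a
cond_resched() per iteration, so other runnable tasks are scheduled and the
softlockup detector stays quiet. A simplified sketch follows; the structure
and helper name are placeholders, not the actual sgx_vepc_release() code.

#include <linux/sched.h>
#include <linux/xarray.h>

void release_one_entry(void *entry);	/* placeholder for the real per-entry teardown */

/*
 * Sketch only: long teardown loops in process context call cond_resched()
 * each iteration so other tasks get a chance to run.
 */
static void release_big_xarray(struct xarray *pages)
{
	unsigned long index;
	void *entry;

	xa_for_each(pages, index, entry) {
		release_one_entry(entry);	/* potentially slow per-entry work */
		xa_erase(pages, index);
		cond_resched();			/* may reschedule; avoids softlockups */
	}
}
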