From 6973091d1b50ab4042f6a2d495f59e9db3662ab8 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Tue, 11 Oct 2022 18:07:12 +0200 Subject: [PATCH 01/20] KVM: s390: pv: don't allow userspace to set the clock under PV When running under PV, the guest's TOD clock is under control of the ultravisor and the hypervisor isn't allowed to change it. Hence, don't allow userspace to change the guest's TOD clock by returning -EOPNOTSUPP. When userspace changes the guest's TOD clock, KVM updates its kvm.arch.epoch field and, in addition, the epoch field in all state descriptions of all VCPUs. But, under PV, the ultravisor will ignore the epoch field in the state description and simply overwrite it on next SIE exit with the actual guest epoch. This leads to KVM having an incorrect view of the guest's TOD clock: it has updated its internal kvm.arch.epoch field, but the ultravisor ignores the field in the state description. Whenever a guest is now waiting for a clock comparator, KVM will incorrectly calculate the time when the guest should wake up, possibly causing the guest to sleep for much longer than expected. With this change, kvm_s390_set_tod() will now take the kvm->lock to be able to call kvm_s390_pv_is_protected(). Since kvm_s390_set_tod_clock() also takes kvm->lock, use __kvm_s390_set_tod_clock() instead. The function kvm_s390_set_tod_clock is now unused, hence remove it. Update the documentation to indicate the TOD clock attr calls can now return -EOPNOTSUPP. Fixes: 0f3035047140 ("KVM: s390: protvirt: Do only reset registers that are accessible") Reported-by: Marc Hartmayer Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20221011160712.928239-2-nrb@linux.ibm.com Message-Id: <20221011160712.928239-2-nrb@linux.ibm.com> Signed-off-by: Janosch Frank --- Documentation/virt/kvm/devices/vm.rst | 3 +++ arch/s390/kvm/kvm-s390.c | 26 +++++++++++++++++--------- arch/s390/kvm/kvm-s390.h | 1 - 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/Documentation/virt/kvm/devices/vm.rst b/Documentation/virt/kvm/devices/vm.rst index 0aa5b1cfd700..60acc39e0e93 100644 --- a/Documentation/virt/kvm/devices/vm.rst +++ b/Documentation/virt/kvm/devices/vm.rst @@ -215,6 +215,7 @@ KVM_S390_VM_TOD_EXT). :Parameters: address of a buffer in user space to store the data (u8) to :Returns: -EFAULT if the given address is not accessible from kernel space; -EINVAL if setting the TOD clock extension to != 0 is not supported + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW ----------------------------------- @@ -224,6 +225,7 @@ the POP (u64). :Parameters: address of a buffer in user space to store the data (u64) to :Returns: -EFAULT if the given address is not accessible from kernel space + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 3.3. ATTRIBUTE: KVM_S390_VM_TOD_EXT ----------------------------------- @@ -237,6 +239,7 @@ it, it is stored as 0 and not allowed to be set to a value != 0. (kvm_s390_vm_tod_clock) to :Returns: -EFAULT if the given address is not accessible from kernel space; -EINVAL if setting the TOD clock extension to != 0 is not supported + -EOPNOTSUPP for a PV guest (TOD managed by the ultravisor) 4. GROUP: KVM_S390_VM_CRYPTO ============================ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 45d4b8182b07..bc491a73815c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1207,6 +1207,8 @@ static int kvm_s390_vm_get_migration(struct kvm *kvm, return 0; } +static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); + static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) { struct kvm_s390_vm_tod_clock gtod; @@ -1216,7 +1218,7 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -1247,7 +1249,7 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, >od); + __kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -1259,6 +1261,16 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) if (attr->flags) return -EINVAL; + mutex_lock(&kvm->lock); + /* + * For protected guests, the TOD is managed by the ultravisor, so trying + * to change it will never bring the expected results. + */ + if (kvm_s390_pv_is_protected(kvm)) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + switch (attr->attr) { case KVM_S390_VM_TOD_EXT: ret = kvm_s390_set_tod_ext(kvm, attr); @@ -1273,6 +1285,9 @@ static int kvm_s390_set_tod(struct kvm *kvm, struct kvm_device_attr *attr) ret = -ENXIO; break; } + +out_unlock: + mutex_unlock(&kvm->lock); return ret; } @@ -4377,13 +4392,6 @@ static void __kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_t preempt_enable(); } -void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) -{ - mutex_lock(&kvm->lock); - __kvm_s390_set_tod_clock(kvm, gtod); - mutex_unlock(&kvm->lock); -} - int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod) { if (!mutex_trylock(&kvm->lock)) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index f6fd668f887e..4755492dfabc 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -363,7 +363,6 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); int kvm_s390_try_set_tod_clock(struct kvm *kvm, const struct kvm_s390_vm_tod_clock *gtod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); From b6662e37772715447aeff2538444ff291e02ea31 Mon Sep 17 00:00:00 2001 From: Rafael Mendonca Date: Tue, 25 Oct 2022 22:32:33 -0300 Subject: [PATCH 02/20] KVM: s390: pci: Fix allocation size of aift kzdev elements The 'kzdev' field of struct 'zpci_aift' is an array of pointers to 'kvm_zdev' structs. Allocate the proper size accordingly. Reported by Coccinelle: WARNING: Use correct pointer type argument for sizeof Fixes: 98b1d33dac5f ("KVM: s390: pci: do initial setup for AEN interpretation") Signed-off-by: Rafael Mendonca Reviewed-by: Christian Borntraeger Reviewed-by: Matthew Rosato Link: https://lore.kernel.org/r/20221026013234.960859-1-rafaelmendsr@gmail.com Message-Id: <20221026013234.960859-1-rafaelmendsr@gmail.com> Signed-off-by: Janosch Frank --- arch/s390/kvm/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index c50c1645c0ae..ded1af2ddae9 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -126,7 +126,7 @@ int kvm_s390_pci_aen_init(u8 nisc) return -EPERM; mutex_lock(&aift->aift_lock); - aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev), + aift->kzdev = kcalloc(ZPCI_NR_DEVICES, sizeof(struct kvm_zdev *), GFP_KERNEL); if (!aift->kzdev) { rc = -ENOMEM; From debc5a1ec0d195ffea70d11efeffb713de9fdbc7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Nov 2022 09:44:53 +0100 Subject: [PATCH 03/20] KVM: x86: use a separate asm-offsets.c file This already removes an ugly #include "" from asm-offsets.c, but especially it avoids a future error when trying to define asm-offsets for KVM's svm/svm.h header. This would not work for kernel/asm-offsets.c, because svm/svm.h includes kvm_cache_regs.h which is not in the include path when compiling asm-offsets.c. The problem is not there if the .c file is in arch/x86/kvm. Suggested-by: Sean Christopherson Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kernel/asm-offsets.c | 6 ------ arch/x86/kvm/.gitignore | 2 ++ arch/x86/kvm/Makefile | 9 +++++++++ arch/x86/kvm/kvm-asm-offsets.c | 18 ++++++++++++++++++ arch/x86/kvm/vmx/vmenter.S | 2 +- 5 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 arch/x86/kvm/.gitignore create mode 100644 arch/x86/kvm/kvm-asm-offsets.c diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cb50589a7102..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -19,7 +19,6 @@ #include #include #include -#include "../kvm/vmx/vmx.h" #ifdef CONFIG_XEN #include @@ -108,9 +107,4 @@ static void __used common(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); OFFSET(TSS_sp2, tss_struct, x86_tss.sp2); - - if (IS_ENABLED(CONFIG_KVM_INTEL)) { - BLANK(); - OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); - } } diff --git a/arch/x86/kvm/.gitignore b/arch/x86/kvm/.gitignore new file mode 100644 index 000000000000..615d6ff35c00 --- /dev/null +++ b/arch/x86/kvm/.gitignore @@ -0,0 +1,2 @@ +/kvm-asm-offsets.s +/kvm-asm-offsets.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 30f244b64523..a02cf9baacc8 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -34,3 +34,12 @@ endif obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o + +AFLAGS_vmx/vmenter.o := -iquote $(obj) +$(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h + +$(obj)/kvm-asm-offsets.h: $(obj)/kvm-asm-offsets.s FORCE + $(call filechk,offsets,__KVM_ASM_OFFSETS_H__) + +targets += kvm-asm-offsets.s +clean-files += kvm-asm-offsets.h diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c new file mode 100644 index 000000000000..9d84f2b32d7f --- /dev/null +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed to extract + * and format the required data. + */ +#define COMPILE_OFFSETS + +#include +#include "vmx/vmx.h" + +static void __used common(void) +{ + if (IS_ENABLED(CONFIG_KVM_INTEL)) { + BLANK(); + OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); + } +} diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 8477d8bdd69c..0b5db4de4d09 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include -#include #include #include #include #include #include +#include "kvm-asm-offsets.h" #include "run_flags.h" #define WORD_SIZE (BITS_PER_LONG / 8) From 16fdc1de169ee0a4e59a8c02244414ec7acd55c3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 Sep 2022 14:14:44 -0400 Subject: [PATCH 04/20] KVM: SVM: replace regs argument of __svm_vcpu_run() with vcpu_svm Since registers are reachable through vcpu_svm, and we will need to access more fields of that struct, pass it instead of the regs[] array. No functional change intended. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 3 +++ arch/x86/kvm/kvm-asm-offsets.c | 6 ++++++ arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/svm/vmenter.S | 37 +++++++++++++++++----------------- 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index a02cf9baacc8..f453a0f96e24 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -35,6 +35,9 @@ obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o +AFLAGS_svm/vmenter.o := -iquote $(obj) +$(obj)/svm/vmenter.o: $(obj)/kvm-asm-offsets.h + AFLAGS_vmx/vmenter.o := -iquote $(obj) $(obj)/vmx/vmenter.o: $(obj)/kvm-asm-offsets.h diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index 9d84f2b32d7f..30db96852e2d 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -8,9 +8,15 @@ #include #include "vmx/vmx.h" +#include "svm/svm.h" static void __used common(void) { + if (IS_ENABLED(CONFIG_KVM_AMD)) { + BLANK(); + OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs); + } + if (IS_ENABLED(CONFIG_KVM_INTEL)) { BLANK(); OFFSET(VMX_spec_ctrl, vcpu_vmx, spec_ctrl); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 58f0077d9357..b412bc5773c5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3930,7 +3930,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * vmcb02 when switching vmcbs for nested virtualization. */ vmload(svm->vmcb01.pa); - __svm_vcpu_run(vmcb_pa, (unsigned long *)&vcpu->arch.regs); + __svm_vcpu_run(vmcb_pa, svm); vmsave(svm->vmcb01.pa); vmload(__sme_page_pa(sd->save_area)); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6a7686bf6900..447e25c9101a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -684,6 +684,6 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); -void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); +void __svm_vcpu_run(unsigned long vmcb_pa, struct vcpu_svm *svm); #endif diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 723f8534986c..f0ff41103e4c 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -4,27 +4,28 @@ #include #include #include +#include "kvm-asm-offsets.h" #define WORD_SIZE (BITS_PER_LONG / 8) /* Intentionally omit RAX as it's context switched by hardware */ -#define VCPU_RCX __VCPU_REGS_RCX * WORD_SIZE -#define VCPU_RDX __VCPU_REGS_RDX * WORD_SIZE -#define VCPU_RBX __VCPU_REGS_RBX * WORD_SIZE +#define VCPU_RCX (SVM_vcpu_arch_regs + __VCPU_REGS_RCX * WORD_SIZE) +#define VCPU_RDX (SVM_vcpu_arch_regs + __VCPU_REGS_RDX * WORD_SIZE) +#define VCPU_RBX (SVM_vcpu_arch_regs + __VCPU_REGS_RBX * WORD_SIZE) /* Intentionally omit RSP as it's context switched by hardware */ -#define VCPU_RBP __VCPU_REGS_RBP * WORD_SIZE -#define VCPU_RSI __VCPU_REGS_RSI * WORD_SIZE -#define VCPU_RDI __VCPU_REGS_RDI * WORD_SIZE +#define VCPU_RBP (SVM_vcpu_arch_regs + __VCPU_REGS_RBP * WORD_SIZE) +#define VCPU_RSI (SVM_vcpu_arch_regs + __VCPU_REGS_RSI * WORD_SIZE) +#define VCPU_RDI (SVM_vcpu_arch_regs + __VCPU_REGS_RDI * WORD_SIZE) #ifdef CONFIG_X86_64 -#define VCPU_R8 __VCPU_REGS_R8 * WORD_SIZE -#define VCPU_R9 __VCPU_REGS_R9 * WORD_SIZE -#define VCPU_R10 __VCPU_REGS_R10 * WORD_SIZE -#define VCPU_R11 __VCPU_REGS_R11 * WORD_SIZE -#define VCPU_R12 __VCPU_REGS_R12 * WORD_SIZE -#define VCPU_R13 __VCPU_REGS_R13 * WORD_SIZE -#define VCPU_R14 __VCPU_REGS_R14 * WORD_SIZE -#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE +#define VCPU_R8 (SVM_vcpu_arch_regs + __VCPU_REGS_R8 * WORD_SIZE) +#define VCPU_R9 (SVM_vcpu_arch_regs + __VCPU_REGS_R9 * WORD_SIZE) +#define VCPU_R10 (SVM_vcpu_arch_regs + __VCPU_REGS_R10 * WORD_SIZE) +#define VCPU_R11 (SVM_vcpu_arch_regs + __VCPU_REGS_R11 * WORD_SIZE) +#define VCPU_R12 (SVM_vcpu_arch_regs + __VCPU_REGS_R12 * WORD_SIZE) +#define VCPU_R13 (SVM_vcpu_arch_regs + __VCPU_REGS_R13 * WORD_SIZE) +#define VCPU_R14 (SVM_vcpu_arch_regs + __VCPU_REGS_R14 * WORD_SIZE) +#define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE) #endif .section .noinstr.text, "ax" @@ -32,7 +33,7 @@ /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode * @vmcb_pa: unsigned long - * @regs: unsigned long * (to guest registers) + * @svm: struct vcpu_svm * */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP @@ -47,13 +48,13 @@ SYM_FUNC_START(__svm_vcpu_run) #endif push %_ASM_BX - /* Save @regs. */ + /* Save @svm. */ push %_ASM_ARG2 /* Save @vmcb. */ push %_ASM_ARG1 - /* Move @regs to RAX. */ + /* Move @svm to RAX. */ mov %_ASM_ARG2, %_ASM_AX /* Load guest registers. */ @@ -89,7 +90,7 @@ SYM_FUNC_START(__svm_vcpu_run) FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif - /* "POP" @regs to RAX. */ + /* "POP" @svm to RAX. */ pop %_ASM_AX /* Save all guest registers. */ From f7ef280132f9bf6f82acf5aa5c3c837206eef501 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 28 Oct 2022 17:30:07 -0400 Subject: [PATCH 05/20] KVM: SVM: adjust register allocation for __svm_vcpu_run() 32-bit ABI uses RAX/RCX/RDX as its argument registers, so they are in the way of instructions that hardcode their operands such as RDMSR/WRMSR or VMLOAD/VMRUN/VMSAVE. In preparation for moving vmload/vmsave to __svm_vcpu_run(), keep the pointer to the struct vcpu_svm in %rdi. In particular, it is now possible to load svm->vmcb01.pa in %rax without clobbering the struct vcpu_svm pointer. No functional change intended. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/vmenter.S | 40 +++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index f0ff41103e4c..531510ab6072 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -54,30 +54,30 @@ SYM_FUNC_START(__svm_vcpu_run) /* Save @vmcb. */ push %_ASM_ARG1 - /* Move @svm to RAX. */ - mov %_ASM_ARG2, %_ASM_AX - - /* Load guest registers. */ - mov VCPU_RCX(%_ASM_AX), %_ASM_CX - mov VCPU_RDX(%_ASM_AX), %_ASM_DX - mov VCPU_RBX(%_ASM_AX), %_ASM_BX - mov VCPU_RBP(%_ASM_AX), %_ASM_BP - mov VCPU_RSI(%_ASM_AX), %_ASM_SI - mov VCPU_RDI(%_ASM_AX), %_ASM_DI -#ifdef CONFIG_X86_64 - mov VCPU_R8 (%_ASM_AX), %r8 - mov VCPU_R9 (%_ASM_AX), %r9 - mov VCPU_R10(%_ASM_AX), %r10 - mov VCPU_R11(%_ASM_AX), %r11 - mov VCPU_R12(%_ASM_AX), %r12 - mov VCPU_R13(%_ASM_AX), %r13 - mov VCPU_R14(%_ASM_AX), %r14 - mov VCPU_R15(%_ASM_AX), %r15 -#endif + /* Move @svm to RDI. */ + mov %_ASM_ARG2, %_ASM_DI /* "POP" @vmcb to RAX. */ pop %_ASM_AX + /* Load guest registers. */ + mov VCPU_RCX(%_ASM_DI), %_ASM_CX + mov VCPU_RDX(%_ASM_DI), %_ASM_DX + mov VCPU_RBX(%_ASM_DI), %_ASM_BX + mov VCPU_RBP(%_ASM_DI), %_ASM_BP + mov VCPU_RSI(%_ASM_DI), %_ASM_SI +#ifdef CONFIG_X86_64 + mov VCPU_R8 (%_ASM_DI), %r8 + mov VCPU_R9 (%_ASM_DI), %r9 + mov VCPU_R10(%_ASM_DI), %r10 + mov VCPU_R11(%_ASM_DI), %r11 + mov VCPU_R12(%_ASM_DI), %r12 + mov VCPU_R13(%_ASM_DI), %r13 + mov VCPU_R14(%_ASM_DI), %r14 + mov VCPU_R15(%_ASM_DI), %r15 +#endif + mov VCPU_RDI(%_ASM_DI), %_ASM_DI + /* Enter guest mode */ sti From f6d58266d731fd7e63163790aad21e0dbb1d5264 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 7 Nov 2022 04:17:29 -0500 Subject: [PATCH 06/20] KVM: SVM: retrieve VMCB from assembly Continue moving accesses to struct vcpu_svm to vmenter.S. Reducing the number of arguments limits the chance of mistakes due to different registers used for argument passing in 32- and 64-bit ABIs; pushing the VMCB argument and almost immediately popping it into a different register looks pretty weird. 32-bit ABI is not a concern for __svm_sev_es_vcpu_run() which is 64-bit only; however, it will soon need @svm to save/restore SPEC_CTRL so stay consistent with __svm_vcpu_run() and let them share the same prototype. No functional change intended. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm-asm-offsets.c | 2 ++ arch/x86/kvm/svm/svm.c | 5 ++--- arch/x86/kvm/svm/svm.h | 4 ++-- arch/x86/kvm/svm/vmenter.S | 20 ++++++++++---------- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index 30db96852e2d..f1b694e431ae 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -15,6 +15,8 @@ static void __used common(void) if (IS_ENABLED(CONFIG_KVM_AMD)) { BLANK(); OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs); + OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb); + OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa); } if (IS_ENABLED(CONFIG_KVM_INTEL)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b412bc5773c5..0c86c435c51f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3914,12 +3914,11 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - unsigned long vmcb_pa = svm->current_vmcb->pa; guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { - __svm_sev_es_vcpu_run(vmcb_pa); + __svm_sev_es_vcpu_run(svm); } else { struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); @@ -3930,7 +3929,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) * vmcb02 when switching vmcbs for nested virtualization. */ vmload(svm->vmcb01.pa); - __svm_vcpu_run(vmcb_pa, svm); + __svm_vcpu_run(svm); vmsave(svm->vmcb01.pa); vmload(__sme_page_pa(sd->save_area)); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 447e25c9101a..7ff1879e73c5 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -683,7 +683,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ -void __svm_sev_es_vcpu_run(unsigned long vmcb_pa); -void __svm_vcpu_run(unsigned long vmcb_pa, struct vcpu_svm *svm); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm); +void __svm_vcpu_run(struct vcpu_svm *svm); #endif diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 531510ab6072..d07bac1952c5 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -32,7 +32,6 @@ /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode - * @vmcb_pa: unsigned long * @svm: struct vcpu_svm * */ SYM_FUNC_START(__svm_vcpu_run) @@ -49,16 +48,16 @@ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BX /* Save @svm. */ - push %_ASM_ARG2 - - /* Save @vmcb. */ push %_ASM_ARG1 +.ifnc _ASM_ARG1, _ASM_DI /* Move @svm to RDI. */ - mov %_ASM_ARG2, %_ASM_DI + mov %_ASM_ARG1, %_ASM_DI +.endif - /* "POP" @vmcb to RAX. */ - pop %_ASM_AX + /* Get svm->current_vmcb->pa into RAX. */ + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX /* Load guest registers. */ mov VCPU_RCX(%_ASM_DI), %_ASM_CX @@ -170,7 +169,7 @@ SYM_FUNC_END(__svm_vcpu_run) /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode - * @vmcb_pa: unsigned long + * @svm: struct vcpu_svm * */ SYM_FUNC_START(__svm_sev_es_vcpu_run) push %_ASM_BP @@ -185,8 +184,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif push %_ASM_BX - /* Move @vmcb to RAX. */ - mov %_ASM_ARG1, %_ASM_AX + /* Get svm->current_vmcb->pa into RAX. */ + mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX + mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX /* Enter guest mode */ sti From 001459787158280d84fce1bc94cec49f34f48b9f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Nov 2022 04:54:40 -0500 Subject: [PATCH 07/20] KVM: SVM: remove unused field from struct vcpu_svm The pointer to svm_cpu_data in struct vcpu_svm looks interesting from the point of view of accessing it after vmexit, when the GSBASE is still containing the guest value. However, despite existing since the very first commit of drivers/kvm/svm.c (commit 6aa8b732ca01, "[PATCH] kvm: userspace interface", 2006-12-10), it was never set to anything. Ignore the opportunity to fix a 16 year old "bug" and delete it; doing things the "harder" way makes it possible to remove more old cruft. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7ff1879e73c5..626240707ba9 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -209,7 +209,6 @@ struct vcpu_svm { struct vmcb *vmcb; struct kvm_vmcb_info vmcb01; struct kvm_vmcb_info *current_vmcb; - struct svm_cpu_data *svm_data; u32 asid; u32 sysenter_esp_hi; u32 sysenter_eip_hi; From 181d0fb0bb023e8996b1cf7970e3708d72442b0b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Nov 2022 08:54:20 -0500 Subject: [PATCH 08/20] KVM: SVM: remove dead field from struct svm_cpu_data The "cpu" field of struct svm_cpu_data has been write-only since commit 4b656b120249 ("KVM: SVM: force new asid on vcpu migration", 2009-08-05). Remove it. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/svm.c | 1 - arch/x86/kvm/svm/svm.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0c86c435c51f..0f873b298931 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -665,7 +665,6 @@ static int svm_cpu_init(int cpu) sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); if (!sd) return ret; - sd->cpu = cpu; sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) goto free_cpu_data; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 626240707ba9..7540db9902a6 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -280,8 +280,6 @@ struct vcpu_svm { }; struct svm_cpu_data { - int cpu; - u64 asid_generation; u32 max_asid; u32 next_asid; From 73412dfeea724e6bd775ba64d21157ff322eac9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 9 Nov 2022 09:07:55 -0500 Subject: [PATCH 09/20] KVM: SVM: do not allocate struct svm_cpu_data dynamically The svm_data percpu variable is a pointer, but it is allocated via svm_hardware_setup() when KVM is loaded. Unlike hardware_enable() this means that it is never NULL for the whole lifetime of KVM, and static allocation does not waste any memory compared to the status quo. It is also more efficient and more easily handled from assembly code, so do it and don't look back. Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 ++-- arch/x86/kvm/svm/svm.c | 41 +++++++++++++++-------------------------- arch/x86/kvm/svm/svm.h | 2 +- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 28064060413a..9b66ee34e264 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -196,7 +196,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) __set_bit(sev->asid, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { - sd = per_cpu(svm_data, cpu); + sd = per_cpu_ptr(&svm_data, cpu); sd->sev_vmcbs[sev->asid] = NULL; } @@ -2600,7 +2600,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) void pre_sev_run(struct vcpu_svm *svm, int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); int asid = sev_get_asid(svm->vcpu.kvm); /* Assign the asid allocated with this SEV guest */ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 0f873b298931..48274c93d78b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -245,7 +245,7 @@ struct kvm_ldttss_desc { u32 zero1; } __attribute__((packed)); -DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); +DEFINE_PER_CPU(struct svm_cpu_data, svm_data); /* * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via @@ -581,12 +581,7 @@ static int svm_hardware_enable(void) pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } - sd = per_cpu(svm_data, me); - if (!sd) { - pr_err("%s: svm_data is NULL on %d\n", __func__, me); - return -EINVAL; - } - + sd = per_cpu_ptr(&svm_data, me); sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; @@ -646,41 +641,35 @@ static int svm_hardware_enable(void) static void svm_cpu_uninit(int cpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); - if (!sd) + if (!sd->save_area) return; - per_cpu(svm_data, cpu) = NULL; kfree(sd->sev_vmcbs); __free_page(sd->save_area); - kfree(sd); + sd->save_area = NULL; } static int svm_cpu_init(int cpu) { - struct svm_cpu_data *sd; + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); int ret = -ENOMEM; - sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); - if (!sd) - return ret; + memset(sd, 0, sizeof(struct svm_cpu_data)); sd->save_area = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!sd->save_area) - goto free_cpu_data; + return ret; ret = sev_cpu_init(sd); if (ret) goto free_save_area; - per_cpu(svm_data, cpu) = sd; - return 0; free_save_area: __free_page(sd->save_area); -free_cpu_data: - kfree(sd); + sd->save_area = NULL; return ret; } @@ -1424,7 +1413,7 @@ static void svm_clear_current_vmcb(struct vmcb *vmcb) int i; for_each_online_cpu(i) - cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); + cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL); } static void svm_vcpu_free(struct kvm_vcpu *vcpu) @@ -1449,7 +1438,7 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); if (sev_es_guest(vcpu->kvm)) sev_es_unmap_ghcb(svm); @@ -1486,7 +1475,7 @@ static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_svm *svm = to_svm(vcpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); if (sd->current_vmcb != svm->vmcb) { sd->current_vmcb = svm->vmcb; @@ -3442,7 +3431,7 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) static void reload_tss(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); @@ -3450,7 +3439,7 @@ static void reload_tss(struct kvm_vcpu *vcpu) static void pre_svm_run(struct kvm_vcpu *vcpu) { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); /* @@ -3919,7 +3908,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(svm); } else { - struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); + struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); /* * Use a single vmcb (vmcb01 because it's always valid) for diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 7540db9902a6..2af6a71126c1 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -293,7 +293,7 @@ struct svm_cpu_data { struct vmcb **sev_vmcbs; }; -DECLARE_PER_CPU(struct svm_cpu_data *, svm_data); +DECLARE_PER_CPU(struct svm_cpu_data, svm_data); void recalc_intercepts(struct vcpu_svm *svm); From e61ab42de874c5af8c5d98b327c77a374d9e7da1 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 7 Nov 2022 05:14:27 -0500 Subject: [PATCH 10/20] KVM: SVM: move guest vmsave/vmload back to assembly It is error-prone that code after vmexit cannot access percpu data because GSBASE has not been restored yet. It forces MSR_IA32_SPEC_CTRL save/restore to happen very late, after the predictor untraining sequence, and it gets in the way of return stack depth tracking (a retbleed mitigation that is in linux-next as of 2022-11-09). As a first step towards fixing that, move the VMCB VMSAVE/VMLOAD to assembly, essentially undoing commit fb0c4a4fee5a ("KVM: SVM: move VMLOAD/VMSAVE to C code", 2021-03-15). The reason for that commit was that it made it simpler to use a different VMCB for VMLOAD/VMSAVE versus VMRUN; but that is not a big hassle anymore thanks to the kvm-asm-offsets machinery and other related cleanups. The idea on how to number the exception tables is stolen from a prototype patch by Peter Zijlstra. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Link: Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm-asm-offsets.c | 1 + arch/x86/kvm/svm/svm.c | 9 ------- arch/x86/kvm/svm/vmenter.S | 49 ++++++++++++++++++++++++++-------- 3 files changed, 39 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index f1b694e431ae..f83e88b85bf2 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -16,6 +16,7 @@ static void __used common(void) BLANK(); OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs); OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb); + OFFSET(SVM_vmcb01, vcpu_svm, vmcb01); OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 48274c93d78b..4e3a47eb5002 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3910,16 +3910,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) } else { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); - /* - * Use a single vmcb (vmcb01 because it's always valid) for - * context switching guest state via VMLOAD/VMSAVE, that way - * the state doesn't need to be copied between vmcb01 and - * vmcb02 when switching vmcbs for nested virtualization. - */ - vmload(svm->vmcb01.pa); __svm_vcpu_run(svm); - vmsave(svm->vmcb01.pa); - vmload(__sme_page_pa(sd->save_area)); } diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index d07bac1952c5..5bc2ed7d79c0 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -28,6 +28,8 @@ #define VCPU_R15 (SVM_vcpu_arch_regs + __VCPU_REGS_R15 * WORD_SIZE) #endif +#define SVM_vmcb01_pa (SVM_vmcb01 + KVM_VMCB_pa) + .section .noinstr.text, "ax" /** @@ -55,6 +57,16 @@ SYM_FUNC_START(__svm_vcpu_run) mov %_ASM_ARG1, %_ASM_DI .endif + /* + * Use a single vmcb (vmcb01 because it's always valid) for + * context switching guest state via VMLOAD/VMSAVE, that way + * the state doesn't need to be copied between vmcb01 and + * vmcb02 when switching vmcbs for nested virtualization. + */ + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX +1: vmload %_ASM_AX +2: + /* Get svm->current_vmcb->pa into RAX. */ mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX @@ -80,16 +92,11 @@ SYM_FUNC_START(__svm_vcpu_run) /* Enter guest mode */ sti -1: vmrun %_ASM_AX +3: vmrun %_ASM_AX +4: + cli -2: cli - -#ifdef CONFIG_RETPOLINE - /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ - FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE -#endif - - /* "POP" @svm to RAX. */ + /* Pop @svm to RAX while it's the only available register. */ pop %_ASM_AX /* Save all guest registers. */ @@ -110,6 +117,18 @@ SYM_FUNC_START(__svm_vcpu_run) mov %r15, VCPU_R15(%_ASM_AX) #endif + /* @svm can stay in RDI from now on. */ + mov %_ASM_AX, %_ASM_DI + + mov SVM_vmcb01_pa(%_ASM_DI), %_ASM_AX +5: vmsave %_ASM_AX +6: + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + /* * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be * untrained as soon as we exit the VM and are back to the @@ -159,11 +178,19 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_BP RET -3: cmpb $0, kvm_rebooting +10: cmpb $0, kvm_rebooting jne 2b ud2 +30: cmpb $0, kvm_rebooting + jne 4b + ud2 +50: cmpb $0, kvm_rebooting + jne 6b + ud2 - _ASM_EXTABLE(1b, 3b) + _ASM_EXTABLE(1b, 10b) + _ASM_EXTABLE(3b, 30b) + _ASM_EXTABLE(5b, 50b) SYM_FUNC_END(__svm_vcpu_run) From e287bd005ad9d85dd6271dd795d3ecfb6bca46ad Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 7 Nov 2022 03:49:59 -0500 Subject: [PATCH 11/20] KVM: SVM: restore host save area from assembly Allow access to the percpu area via the GS segment base, which is needed in order to access the saved host spec_ctrl value. In linux-next FILL_RETURN_BUFFER also needs to access percpu data. For simplicity, the physical address of the save area is added to struct svm_cpu_data. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Reported-by: Nathan Chancellor Analyzed-by: Andrew Cooper Tested-by: Nathan Chancellor Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/kvm-asm-offsets.c | 1 + arch/x86/kvm/svm/svm.c | 14 ++++++-------- arch/x86/kvm/svm/svm.h | 2 ++ arch/x86/kvm/svm/svm_ops.h | 5 ----- arch/x86/kvm/svm/vmenter.S | 17 +++++++++++++++++ 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index f83e88b85bf2..1b805cd24d66 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -18,6 +18,7 @@ static void __used common(void) OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb); OFFSET(SVM_vmcb01, vcpu_svm, vmcb01); OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa); + OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa); } if (IS_ENABLED(CONFIG_KVM_INTEL)) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4e3a47eb5002..469c1b5617af 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -592,7 +592,7 @@ static int svm_hardware_enable(void) wrmsrl(MSR_EFER, efer | EFER_SVME); - wrmsrl(MSR_VM_HSAVE_PA, __sme_page_pa(sd->save_area)); + wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { /* @@ -648,6 +648,7 @@ static void svm_cpu_uninit(int cpu) kfree(sd->sev_vmcbs); __free_page(sd->save_area); + sd->save_area_pa = 0; sd->save_area = NULL; } @@ -665,6 +666,7 @@ static int svm_cpu_init(int cpu) if (ret) goto free_save_area; + sd->save_area_pa = __sme_page_pa(sd->save_area); return 0; free_save_area: @@ -1450,7 +1452,7 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) * Save additional host state that will be restored on VMEXIT (sev-es) * or subsequent vmload of host save area. */ - vmsave(__sme_page_pa(sd->save_area)); + vmsave(sd->save_area_pa); if (sev_es_guest(vcpu->kvm)) { struct sev_es_save_area *hostsa; hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); @@ -3905,14 +3907,10 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) guest_state_enter_irqoff(); - if (sev_es_guest(vcpu->kvm)) { + if (sev_es_guest(vcpu->kvm)) __svm_sev_es_vcpu_run(svm); - } else { - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); - + else __svm_vcpu_run(svm); - vmload(__sme_page_pa(sd->save_area)); - } guest_state_exit_irqoff(); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 2af6a71126c1..83955a4e520e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -287,6 +287,8 @@ struct svm_cpu_data { struct kvm_ldttss_desc *tss_desc; struct page *save_area; + unsigned long save_area_pa; + struct vmcb *current_vmcb; /* index = sev_asid, value = vmcb pointer */ diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 9430d6437c9f..36c8af87a707 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -61,9 +61,4 @@ static __always_inline void vmsave(unsigned long pa) svm_asm1(vmsave, "a" (pa), "memory"); } -static __always_inline void vmload(unsigned long pa) -{ - svm_asm1(vmload, "a" (pa), "memory"); -} - #endif /* __KVM_X86_SVM_OPS_H */ diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 5bc2ed7d79c0..57440acfc73e 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -49,6 +49,14 @@ SYM_FUNC_START(__svm_vcpu_run) #endif push %_ASM_BX + /* + * Save variables needed after vmexit on the stack, in inverse + * order compared to when they are needed. + */ + + /* Needed to restore access to percpu variables. */ + __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa) + /* Save @svm. */ push %_ASM_ARG1 @@ -124,6 +132,11 @@ SYM_FUNC_START(__svm_vcpu_run) 5: vmsave %_ASM_AX 6: + /* Restores GSBASE among other things, allowing access to percpu data. */ + pop %_ASM_AX +7: vmload %_ASM_AX +8: + #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE @@ -187,10 +200,14 @@ SYM_FUNC_START(__svm_vcpu_run) 50: cmpb $0, kvm_rebooting jne 6b ud2 +70: cmpb $0, kvm_rebooting + jne 8b + ud2 _ASM_EXTABLE(1b, 10b) _ASM_EXTABLE(3b, 30b) _ASM_EXTABLE(5b, 50b) + _ASM_EXTABLE(7b, 70b) SYM_FUNC_END(__svm_vcpu_run) From 9f2febf3f04daebdaaa5a43cfa20e3844905c0f9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 Sep 2022 14:24:40 -0400 Subject: [PATCH 12/20] KVM: SVM: move MSR_IA32_SPEC_CTRL save/restore to assembly Restoration of the host IA32_SPEC_CTRL value is probably too late with respect to the return thunk training sequence. With respect to the user/kernel boundary, AMD says, "If software chooses to toggle STIBP (e.g., set STIBP on kernel entry, and clear it on kernel exit), software should set STIBP to 1 before executing the return thunk training sequence." I assume the same requirements apply to the guest/host boundary. The return thunk training sequence is in vmenter.S, quite close to the VM-exit. On hosts without V_SPEC_CTRL, however, the host's IA32_SPEC_CTRL value is not restored until much later. To avoid this, move the restoration of host SPEC_CTRL to assembly and, for consistency, move the restoration of the guest SPEC_CTRL as well. This is not particularly difficult, apart from some care to cover both 32- and 64-bit, and to share code between SEV-ES and normal vmentry. Cc: stable@vger.kernel.org Fixes: a149180fbcf3 ("x86: Add magic AMD return-thunk") Suggested-by: Jim Mattson Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kernel/cpu/bugs.c | 13 +--- arch/x86/kvm/kvm-asm-offsets.c | 1 + arch/x86/kvm/svm/svm.c | 37 ++++------ arch/x86/kvm/svm/svm.h | 4 +- arch/x86/kvm/svm/vmenter.S | 119 ++++++++++++++++++++++++++++++++- 5 files changed, 136 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index da7c361f47e0..6ec0b7ce7453 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -196,22 +196,15 @@ void __init check_bugs(void) } /* - * NOTE: This function is *only* called for SVM. VMX spec_ctrl handling is - * done in vmenter.S. + * NOTE: This function is *only* called for SVM, since Intel uses + * MSR_IA32_SPEC_CTRL for SSBD. */ void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) { - u64 msrval, guestval = guest_spec_ctrl, hostval = spec_ctrl_current(); + u64 guestval, hostval; struct thread_info *ti = current_thread_info(); - if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { - if (hostval != guestval) { - msrval = setguest ? guestval : hostval; - wrmsrl(MSR_IA32_SPEC_CTRL, msrval); - } - } - /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. diff --git a/arch/x86/kvm/kvm-asm-offsets.c b/arch/x86/kvm/kvm-asm-offsets.c index 1b805cd24d66..24a710d37323 100644 --- a/arch/x86/kvm/kvm-asm-offsets.c +++ b/arch/x86/kvm/kvm-asm-offsets.c @@ -16,6 +16,7 @@ static void __used common(void) BLANK(); OFFSET(SVM_vcpu_arch_regs, vcpu_svm, vcpu.arch.regs); OFFSET(SVM_current_vmcb, vcpu_svm, current_vmcb); + OFFSET(SVM_spec_ctrl, vcpu_svm, spec_ctrl); OFFSET(SVM_vmcb01, vcpu_svm, vmcb01); OFFSET(KVM_VMCB_pa, kvm_vmcb_info, pa); OFFSET(SD_save_area_pa, svm_cpu_data, save_area_pa); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 469c1b5617af..cf1aed25f4ab 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -720,6 +720,15 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) u32 offset; u32 *msrpm; + /* + * For non-nested case: + * If the L01 MSR bitmap does not intercept the MSR, then we need to + * save it. + * + * For nested case: + * If the L02 MSR bitmap does not intercept the MSR, then we need to + * save it. + */ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm: to_svm(vcpu)->msrpm; @@ -3901,16 +3910,16 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; } -static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) { struct vcpu_svm *svm = to_svm(vcpu); guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) - __svm_sev_es_vcpu_run(svm); + __svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted); else - __svm_vcpu_run(svm); + __svm_vcpu_run(svm, spec_ctrl_intercepted); guest_state_exit_irqoff(); } @@ -3918,6 +3927,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); trace_kvm_entry(vcpu); @@ -3976,26 +3986,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - svm_vcpu_enter_exit(vcpu); - - /* - * We do not use IBRS in the kernel. If this vCPU has used the - * SPEC_CTRL MSR it may have left it on; save the value and - * turn it off. This is much more efficient than blindly adding - * it to the atomic save/restore list. Especially as the former - * (Saving guest MSRs on vmexit) doesn't even exist in KVM. - * - * For non-nested case: - * If the L01 MSR bitmap does not intercept the MSR, then we need to - * save it. - * - * For nested case: - * If the L02 MSR bitmap does not intercept the MSR, then we need to - * save it. - */ - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL) && - unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) - svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); + svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); if (!sev_es_guest(vcpu->kvm)) reload_tss(vcpu); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 83955a4e520e..199a2ecef1ce 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -682,7 +682,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ -void __svm_sev_es_vcpu_run(struct vcpu_svm *svm); -void __svm_vcpu_run(struct vcpu_svm *svm); +void __svm_sev_es_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); +void __svm_vcpu_run(struct vcpu_svm *svm, bool spec_ctrl_intercepted); #endif diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index 57440acfc73e..34367dc203f2 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -32,9 +32,69 @@ .section .noinstr.text, "ax" +.macro RESTORE_GUEST_SPEC_CTRL + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ + ALTERNATIVE_2 "", \ + "jmp 800f", X86_FEATURE_MSR_SPEC_CTRL, \ + "", X86_FEATURE_V_SPEC_CTRL +801: +.endm +.macro RESTORE_GUEST_SPEC_CTRL_BODY +800: + /* + * SPEC_CTRL handling: if the guest's SPEC_CTRL value differs from the + * host's, write the MSR. This is kept out-of-line so that the common + * case does not have to jump. + * + * IMPORTANT: To avoid RSB underflow attacks and any other nastiness, + * there must not be any returns or indirect branches between this code + * and vmentry. + */ + movl SVM_spec_ctrl(%_ASM_DI), %eax + cmp PER_CPU_VAR(x86_spec_ctrl_current), %eax + je 801b + mov $MSR_IA32_SPEC_CTRL, %ecx + xor %edx, %edx + wrmsr + jmp 801b +.endm + +.macro RESTORE_HOST_SPEC_CTRL + /* No need to do anything if SPEC_CTRL is unset or V_SPEC_CTRL is set */ + ALTERNATIVE_2 "", \ + "jmp 900f", X86_FEATURE_MSR_SPEC_CTRL, \ + "", X86_FEATURE_V_SPEC_CTRL +901: +.endm +.macro RESTORE_HOST_SPEC_CTRL_BODY +900: + /* Same for after vmexit. */ + mov $MSR_IA32_SPEC_CTRL, %ecx + + /* + * Load the value that the guest had written into MSR_IA32_SPEC_CTRL, + * if it was not intercepted during guest execution. + */ + cmpb $0, (%_ASM_SP) + jnz 998f + rdmsr + movl %eax, SVM_spec_ctrl(%_ASM_DI) +998: + + /* Now restore the host value of the MSR if different from the guest's. */ + movl PER_CPU_VAR(x86_spec_ctrl_current), %eax + cmp SVM_spec_ctrl(%_ASM_DI), %eax + je 901b + xor %edx, %edx + wrmsr + jmp 901b +.endm + + /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * + * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_vcpu_run) push %_ASM_BP @@ -54,17 +114,26 @@ SYM_FUNC_START(__svm_vcpu_run) * order compared to when they are needed. */ + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ + push %_ASM_ARG2 + /* Needed to restore access to percpu variables. */ __ASM_SIZE(push) PER_CPU_VAR(svm_data + SD_save_area_pa) - /* Save @svm. */ + /* Finally save @svm. */ push %_ASM_ARG1 .ifnc _ASM_ARG1, _ASM_DI - /* Move @svm to RDI. */ + /* + * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + */ mov %_ASM_ARG1, %_ASM_DI .endif + /* Clobbers RAX, RCX, RDX. */ + RESTORE_GUEST_SPEC_CTRL + /* * Use a single vmcb (vmcb01 because it's always valid) for * context switching guest state via VMLOAD/VMSAVE, that way @@ -142,6 +211,9 @@ SYM_FUNC_START(__svm_vcpu_run) FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif + /* Clobbers RAX, RCX, RDX. */ + RESTORE_HOST_SPEC_CTRL + /* * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be * untrained as soon as we exit the VM and are back to the @@ -177,6 +249,9 @@ SYM_FUNC_START(__svm_vcpu_run) xor %r15d, %r15d #endif + /* "Pop" @spec_ctrl_intercepted. */ + pop %_ASM_BX + pop %_ASM_BX #ifdef CONFIG_X86_64 @@ -191,6 +266,9 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_BP RET + RESTORE_GUEST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY + 10: cmpb $0, kvm_rebooting jne 2b ud2 @@ -214,6 +292,7 @@ SYM_FUNC_END(__svm_vcpu_run) /** * __svm_sev_es_vcpu_run - Run a SEV-ES vCPU via a transition to SVM guest mode * @svm: struct vcpu_svm * + * @spec_ctrl_intercepted: bool */ SYM_FUNC_START(__svm_sev_es_vcpu_run) push %_ASM_BP @@ -228,8 +307,30 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) #endif push %_ASM_BX + /* + * Save variables needed after vmexit on the stack, in inverse + * order compared to when they are needed. + */ + + /* Accessed directly from the stack in RESTORE_HOST_SPEC_CTRL. */ + push %_ASM_ARG2 + + /* Save @svm. */ + push %_ASM_ARG1 + +.ifnc _ASM_ARG1, _ASM_DI + /* + * Stash @svm in RDI early. On 32-bit, arguments are in RAX, RCX + * and RDX which are clobbered by RESTORE_GUEST_SPEC_CTRL. + */ + mov %_ASM_ARG1, %_ASM_DI +.endif + + /* Clobbers RAX, RCX, RDX. */ + RESTORE_GUEST_SPEC_CTRL + /* Get svm->current_vmcb->pa into RAX. */ - mov SVM_current_vmcb(%_ASM_ARG1), %_ASM_AX + mov SVM_current_vmcb(%_ASM_DI), %_ASM_AX mov KVM_VMCB_pa(%_ASM_AX), %_ASM_AX /* Enter guest mode */ @@ -239,11 +340,17 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) 2: cli + /* Pop @svm to RDI, guest registers have been saved already. */ + pop %_ASM_DI + #ifdef CONFIG_RETPOLINE /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE #endif + /* Clobbers RAX, RCX, RDX. */ + RESTORE_HOST_SPEC_CTRL + /* * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be * untrained as soon as we exit the VM and are back to the @@ -253,6 +360,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) */ UNTRAIN_RET + /* "Pop" @spec_ctrl_intercepted. */ + pop %_ASM_BX + pop %_ASM_BX #ifdef CONFIG_X86_64 @@ -267,6 +377,9 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run) pop %_ASM_BP RET + RESTORE_GUEST_SPEC_CTRL_BODY + RESTORE_HOST_SPEC_CTRL_BODY + 3: cmpb $0, kvm_rebooting jne 2b ud2 From bd3d394e367e66e773a6cb25a82c29b04464230b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 30 Sep 2022 14:48:24 -0400 Subject: [PATCH 13/20] x86, KVM: remove unnecessary argument to x86_virt_spec_ctrl and callers x86_virt_spec_ctrl only deals with the paravirtualized MSR_IA32_VIRT_SPEC_CTRL now and does not handle MSR_IA32_SPEC_CTRL anymore; remove the corresponding, unused argument. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/spec-ctrl.h | 10 +++++----- arch/x86/kernel/cpu/bugs.c | 2 +- arch/x86/kvm/svm/svm.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h index 5393babc0598..cb0386fc4dc3 100644 --- a/arch/x86/include/asm/spec-ctrl.h +++ b/arch/x86/include/asm/spec-ctrl.h @@ -13,7 +13,7 @@ * Takes the guest view of SPEC_CTRL MSR as a parameter and also * the guest's version of VIRT_SPEC_CTRL, if emulated. */ -extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); +extern void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool guest); /** * x86_spec_ctrl_set_guest - Set speculation control registers for the guest @@ -24,9 +24,9 @@ extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bo * Avoids writing to the MSR if the content/bits are the same */ static inline -void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +void x86_spec_ctrl_set_guest(u64 guest_virt_spec_ctrl) { - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); + x86_virt_spec_ctrl(guest_virt_spec_ctrl, true); } /** @@ -38,9 +38,9 @@ void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) * Avoids writing to the MSR if the content/bits are the same */ static inline -void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +void x86_spec_ctrl_restore_host(u64 guest_virt_spec_ctrl) { - x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); + x86_virt_spec_ctrl(guest_virt_spec_ctrl, false); } /* AMD specific Speculative Store Bypass MSR data */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6ec0b7ce7453..3e3230cccaa7 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -200,7 +200,7 @@ void __init check_bugs(void) * MSR_IA32_SPEC_CTRL for SSBD. */ void -x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { u64 guestval, hostval; struct thread_info *ti = current_thread_info(); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index cf1aed25f4ab..9f88c8e6766e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3984,7 +3984,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * being speculatively taken. */ if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); + x86_spec_ctrl_set_guest(svm->virt_spec_ctrl); svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); @@ -3992,7 +3992,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) reload_tss(vcpu); if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); + x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); if (!sev_es_guest(vcpu->kvm)) { vcpu->arch.cr2 = svm->vmcb->save.cr2; From 8e1071d0ad300fcce2e2b4e46cb15e41d0166bdc Mon Sep 17 00:00:00 2001 From: Matthias Gerstner Date: Thu, 3 Nov 2022 14:59:27 +0100 Subject: [PATCH 14/20] tools/kvm_stat: fix incorrect detection of debugfs The first field in /proc/mounts can be influenced by unprivileged users through the widespread `fusermount` setuid-root program. Example: ``` user$ mkdir ~/mydebugfs user$ export _FUSE_COMMFD=0 user$ fusermount ~/mydebugfs -ononempty,fsname=debugfs user$ grep debugfs /proc/mounts debugfs /home/user/mydebugfs fuse rw,nosuid,nodev,relatime,user_id=1000,group_id=100 0 0 ``` If there is no debugfs already mounted in the system then this can be used by unprivileged users to trick kvm_stat into using a user controlled file system location for obtaining KVM statistics. Even though the root user is not allowed to access non-root FUSE mounts for security reasons, the unprivileged user can unmount the FUSE mount before kvm_stat uses the mounted path. If it wins the race, kvm_stat will read from the location where the FUSE mount resided. Note that the files in debugfs are only opened for reading, so the attacker can cause very large data to be read in by kvm_stat, or fake data to be processed, but there should be no viable way to turn this into a privilege escalation. The fix is simply to use the file system type field instead. Whitespace in the mount path is escaped in /proc/mounts thus no further safety measures in the parsing should be necessary to make this correct. Message-Id: <20221103135927.13656-1-matthias.gerstner@suse.de> Signed-off-by: Matthias Gerstner Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 9c366b3a676d..88a73999aa58 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1756,7 +1756,7 @@ def assign_globals(): debugfs = '' for line in open('/proc/mounts'): - if line.split(' ')[0] == 'debugfs': + if line.split(' ')[2] == 'debugfs': debugfs = line.split(' ')[1] break if debugfs == '': From 2c1b54348a63096d4d4677743fba7a6d5fda9476 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 7 Nov 2022 22:52:49 +0800 Subject: [PATCH 15/20] tools/kvm_stat: update exit reasons for vmx/svm/aarch64/userspace Update EXIT_REASONS from source, including VMX_EXIT_REASONS, SVM_EXIT_REASONS, AARCH64_EXIT_REASONS, USERSPACE_EXIT_REASONS. Signed-off-by: Rong Tao Message-Id: Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 96 +++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 14 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 88a73999aa58..6f28180ffeea 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -41,11 +41,14 @@ VMX_EXIT_REASONS = { 'EXCEPTION_NMI': 0, 'EXTERNAL_INTERRUPT': 1, 'TRIPLE_FAULT': 2, - 'PENDING_INTERRUPT': 7, + 'INIT_SIGNAL': 3, + 'SIPI_SIGNAL': 4, + 'INTERRUPT_WINDOW': 7, 'NMI_WINDOW': 8, 'TASK_SWITCH': 9, 'CPUID': 10, 'HLT': 12, + 'INVD': 13, 'INVLPG': 14, 'RDPMC': 15, 'RDTSC': 16, @@ -65,26 +68,48 @@ VMX_EXIT_REASONS = { 'MSR_READ': 31, 'MSR_WRITE': 32, 'INVALID_STATE': 33, + 'MSR_LOAD_FAIL': 34, 'MWAIT_INSTRUCTION': 36, + 'MONITOR_TRAP_FLAG': 37, 'MONITOR_INSTRUCTION': 39, 'PAUSE_INSTRUCTION': 40, 'MCE_DURING_VMENTRY': 41, 'TPR_BELOW_THRESHOLD': 43, 'APIC_ACCESS': 44, + 'EOI_INDUCED': 45, + 'GDTR_IDTR': 46, + 'LDTR_TR': 47, 'EPT_VIOLATION': 48, 'EPT_MISCONFIG': 49, + 'INVEPT': 50, + 'RDTSCP': 51, + 'PREEMPTION_TIMER': 52, + 'INVVPID': 53, 'WBINVD': 54, 'XSETBV': 55, 'APIC_WRITE': 56, + 'RDRAND': 57, 'INVPCID': 58, + 'VMFUNC': 59, + 'ENCLS': 60, + 'RDSEED': 61, + 'PML_FULL': 62, + 'XSAVES': 63, + 'XRSTORS': 64, + 'UMWAIT': 67, + 'TPAUSE': 68, + 'BUS_LOCK': 74, + 'NOTIFY': 75, } SVM_EXIT_REASONS = { 'READ_CR0': 0x000, + 'READ_CR2': 0x002, 'READ_CR3': 0x003, 'READ_CR4': 0x004, 'READ_CR8': 0x008, 'WRITE_CR0': 0x010, + 'WRITE_CR2': 0x012, 'WRITE_CR3': 0x013, 'WRITE_CR4': 0x014, 'WRITE_CR8': 0x018, @@ -105,6 +130,7 @@ SVM_EXIT_REASONS = { 'WRITE_DR6': 0x036, 'WRITE_DR7': 0x037, 'EXCP_BASE': 0x040, + 'LAST_EXCP': 0x05f, 'INTR': 0x060, 'NMI': 0x061, 'SMI': 0x062, @@ -151,21 +177,45 @@ SVM_EXIT_REASONS = { 'MWAIT': 0x08b, 'MWAIT_COND': 0x08c, 'XSETBV': 0x08d, + 'RDPRU': 0x08e, + 'EFER_WRITE_TRAP': 0x08f, + 'CR0_WRITE_TRAP': 0x090, + 'CR1_WRITE_TRAP': 0x091, + 'CR2_WRITE_TRAP': 0x092, + 'CR3_WRITE_TRAP': 0x093, + 'CR4_WRITE_TRAP': 0x094, + 'CR5_WRITE_TRAP': 0x095, + 'CR6_WRITE_TRAP': 0x096, + 'CR7_WRITE_TRAP': 0x097, + 'CR8_WRITE_TRAP': 0x098, + 'CR9_WRITE_TRAP': 0x099, + 'CR10_WRITE_TRAP': 0x09a, + 'CR11_WRITE_TRAP': 0x09b, + 'CR12_WRITE_TRAP': 0x09c, + 'CR13_WRITE_TRAP': 0x09d, + 'CR14_WRITE_TRAP': 0x09e, + 'CR15_WRITE_TRAP': 0x09f, + 'INVPCID': 0x0a2, 'NPF': 0x400, + 'AVIC_INCOMPLETE_IPI': 0x401, + 'AVIC_UNACCELERATED_ACCESS': 0x402, + 'VMGEXIT': 0x403, } -# EC definition of HSR (from arch/arm64/include/asm/kvm_arm.h) +# EC definition of HSR (from arch/arm64/include/asm/esr.h) AARCH64_EXIT_REASONS = { 'UNKNOWN': 0x00, - 'WFI': 0x01, + 'WFx': 0x01, 'CP15_32': 0x03, 'CP15_64': 0x04, 'CP14_MR': 0x05, 'CP14_LS': 0x06, 'FP_ASIMD': 0x07, 'CP10_ID': 0x08, + 'PAC': 0x09, 'CP14_64': 0x0C, - 'ILL_ISS': 0x0E, + 'BTI': 0x0D, + 'ILL': 0x0E, 'SVC32': 0x11, 'HVC32': 0x12, 'SMC32': 0x13, @@ -173,21 +223,26 @@ AARCH64_EXIT_REASONS = { 'HVC64': 0x16, 'SMC64': 0x17, 'SYS64': 0x18, - 'IABT': 0x20, - 'IABT_HYP': 0x21, + 'SVE': 0x19, + 'ERET': 0x1A, + 'FPAC': 0x1C, + 'SME': 0x1D, + 'IMP_DEF': 0x1F, + 'IABT_LOW': 0x20, + 'IABT_CUR': 0x21, 'PC_ALIGN': 0x22, - 'DABT': 0x24, - 'DABT_HYP': 0x25, + 'DABT_LOW': 0x24, + 'DABT_CUR': 0x25, 'SP_ALIGN': 0x26, 'FP_EXC32': 0x28, 'FP_EXC64': 0x2C, 'SERROR': 0x2F, - 'BREAKPT': 0x30, - 'BREAKPT_HYP': 0x31, - 'SOFTSTP': 0x32, - 'SOFTSTP_HYP': 0x33, - 'WATCHPT': 0x34, - 'WATCHPT_HYP': 0x35, + 'BREAKPT_LOW': 0x30, + 'BREAKPT_CUR': 0x31, + 'SOFTSTP_LOW': 0x32, + 'SOFTSTP_CUR': 0x33, + 'WATCHPT_LOW': 0x34, + 'WATCHPT_CUR': 0x35, 'BKPT32': 0x38, 'VECTOR32': 0x3A, 'BRK64': 0x3C, @@ -220,6 +275,19 @@ USERSPACE_EXIT_REASONS = { 'S390_TSCH': 22, 'EPR': 23, 'SYSTEM_EVENT': 24, + 'S390_STSI': 25, + 'IOAPIC_EOI': 26, + 'HYPERV': 27, + 'ARM_NISV': 28, + 'X86_RDMSR': 29, + 'X86_WRMSR': 30, + 'DIRTY_RING_FULL': 31, + 'AP_RESET_HOLD': 32, + 'X86_BUS_LOCK': 33, + 'XEN': 34, + 'RISCV_SBI': 35, + 'RISCV_CSR': 36, + 'NOTIFY': 37, } IOCTL_NUMBERS = { From 0bd8bd2f7a789fe1dcb21ad148199d2f62d79873 Mon Sep 17 00:00:00 2001 From: Peter Gonda Date: Fri, 4 Nov 2022 07:22:20 -0700 Subject: [PATCH 16/20] KVM: SVM: Only dump VMSA to klog at KERN_DEBUG level Explicitly print the VMSA dump at KERN_DEBUG log level, KERN_CONT uses KERNEL_DEFAULT if the previous log line has a newline, i.e. if there's nothing to continuing, and as a result the VMSA gets dumped when it shouldn't. The KERN_CONT documentation says it defaults back to KERNL_DEFAULT if the previous log line has a newline. So switch from KERN_CONT to print_hex_dump_debug(). Jarkko pointed this out in reference to the original patch. See: https://lore.kernel.org/all/YuPMeWX4uuR1Tz3M@kernel.org/ print_hex_dump(KERN_DEBUG, ...) was pointed out there, but print_hex_dump_debug() should similar. Fixes: 6fac42f127b8 ("KVM: SVM: Dump Virtual Machine Save Area (VMSA) to klog") Signed-off-by: Peter Gonda Reviewed-by: Sean Christopherson Cc: Jarkko Sakkinen Cc: Harald Hoyer Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Message-Id: <20221104142220.469452-1-pgonda@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 9b66ee34e264..efaaef2b7ae1 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -605,7 +605,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->dr6 = svm->vcpu.arch.dr6; pr_debug("Virtual Machine Save Area (VMSA):\n"); - print_hex_dump(KERN_CONT, "", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); + print_hex_dump_debug("", DUMP_PREFIX_NONE, 16, 1, save, sizeof(*save), false); return 0; } From 8631ef59b62290c7d88e7209e35dfb47f33f4902 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 19 Sep 2022 17:10:06 +0800 Subject: [PATCH 17/20] KVM: x86/pmu: Do not speculatively query Intel GP PMCs that don't exist yet The SDM lists an architectural MSR IA32_CORE_CAPABILITIES (0xCF) that limits the theoretical maximum value of the Intel GP PMC MSRs allocated at 0xC1 to 14; likewise the Intel April 2022 SDM adds IA32_OVERCLOCKING_STATUS at 0x195 which limits the number of event selection MSRs to 15 (0x186-0x194). Limiting the maximum number of counters to 14 or 18 based on the currently allocated MSRs is clearly fragile, and it seems likely that Intel will even place PMCs 8-15 at a completely different range of MSR indices. So stop at the maximum number of GP PMCs supported today on Intel processors. There are some machines, like Intel P4 with non Architectural PMU, that may indeed have 18 counters, but those counters are in a completely different MSR address range and are not supported by KVM. Cc: Vitaly Kuznetsov Cc: stable@vger.kernel.org Fixes: cf05a67b68b8 ("KVM: x86: omit "impossible" pmu MSRs from MSR list") Suggested-by: Jim Mattson Signed-off-by: Like Xu Reviewed-by: Jim Mattson Message-Id: <20220919091008.60695-1-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f5eb577d583..73716fab120f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1442,20 +1442,10 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, - MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, - MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, - MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, - MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, - MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, - MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, - MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, - MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, - MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, @@ -7041,12 +7031,12 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; From 4f1fa2a1bbeb2feca436d2c86bf6f78dc4e5e4c4 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 19 Sep 2022 17:10:07 +0800 Subject: [PATCH 18/20] KVM: x86/pmu: Limit the maximum number of supported Intel GP counters The Intel Architectural IA32_PMCx MSRs addresses range allows for a maximum of 8 GP counters, and KVM cannot address any more. Introduce a local macro (named KVM_INTEL_PMC_MAX_GENERIC) and use it consistently to refer to the number of counters supported by KVM, thus avoiding possible out-of-bound accesses. Suggested-by: Jim Mattson Signed-off-by: Like Xu Reviewed-by: Jim Mattson Message-Id: <20220919091008.60695-2-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++++- arch/x86/kvm/pmu.c | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 4 ++-- arch/x86/kvm/x86.c | 12 +++++++----- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7551b6f9c31c..286f6130dcb4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -501,6 +501,10 @@ struct kvm_pmc { bool intr; }; +/* More counters may conflict with other existing Architectural MSRs */ +#define KVM_INTEL_PMC_MAX_GENERIC 8 +#define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) +#define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 struct kvm_pmu { unsigned nr_arch_gp_counters; @@ -516,7 +520,7 @@ struct kvm_pmu { u64 reserved_bits; u64 raw_event_mask; u8 version; - struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; struct irq_work irq_work; DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index d9b9a0f0db17..de1fd7369736 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -56,7 +56,7 @@ static const struct x86_cpu_id vmx_icl_pebs_cpu[] = { * code. Each pmc, stored in kvm_pmc.idx field, is unique across * all perf counters (both gp and fixed). The mapping relationship * between pmc and perf counters is as the following: - * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed * * AMD: [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H * and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 25b70a85bef5..10b33da9bd05 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -617,7 +617,7 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct lbr_desc *lbr_desc = vcpu_to_lbr_desc(vcpu); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -643,7 +643,7 @@ static void intel_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmc *pmc = NULL; int i; - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + for (i = 0; i < KVM_INTEL_PMC_MAX_GENERIC; i++) { pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 73716fab120f..1e25f410d1fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1438,6 +1438,9 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, + + /* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, @@ -1446,7 +1449,6 @@ static const u32 msrs_to_save_all[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, - MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG, MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, @@ -7031,14 +7033,14 @@ static void kvm_init_msr_list(void) intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; - case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 7: + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; - case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 7: + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= - min(INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) + min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp)) continue; break; case MSR_IA32_XFD: From 556f3c9ad7c101aa16a43ef4539f3aabc1d7b32e Mon Sep 17 00:00:00 2001 From: Like Xu Date: Mon, 19 Sep 2022 17:10:08 +0800 Subject: [PATCH 19/20] KVM: x86/pmu: Limit the maximum number of supported AMD GP counters The AMD PerfMonV2 specification allows for a maximum of 16 GP counters, but currently only 6 pairs of MSRs are accepted by KVM. While AMD64_NUM_COUNTERS_CORE is already equal to 6, increasing without adjusting msrs_to_save_all[] could result in out-of-bounds accesses. Therefore introduce a macro (named KVM_AMD_PMC_MAX_GENERIC) to refer to the number of counters supported by KVM. Signed-off-by: Like Xu Reviewed-by: Jim Mattson Message-Id: <20220919091008.60695-3-likexu@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/pmu.c | 7 ++++--- arch/x86/kvm/x86.c | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 286f6130dcb4..f05ebaa26f0f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -506,6 +506,7 @@ struct kvm_pmc { #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 +#define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index b68956299fa8..9d65cd095691 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -192,9 +192,10 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - BUILD_BUG_ON(AMD64_NUM_COUNTERS_CORE > INTEL_PMC_MAX_GENERIC); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > AMD64_NUM_COUNTERS_CORE); + BUILD_BUG_ON(KVM_AMD_PMC_MAX_GENERIC > INTEL_PMC_MAX_GENERIC); - for (i = 0; i < AMD64_NUM_COUNTERS_CORE ; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC ; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; @@ -207,7 +208,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); int i; - for (i = 0; i < AMD64_NUM_COUNTERS_CORE; i++) { + for (i = 0; i < KVM_AMD_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e25f410d1fe..ecea83f0da49 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1452,10 +1452,13 @@ static const u32 msrs_to_save_all[] = { MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + + /* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, }; From 6d3085e4d89ad7e6c7f1c6cf929d903393565861 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 11 Nov 2022 00:18:41 +0000 Subject: [PATCH 20/20] KVM: x86/mmu: Block all page faults during kvm_zap_gfn_range() When zapping a GFN range, pass 0 => ALL_ONES for the to-be-invalidated range to effectively block all page faults while the zap is in-progress. The invalidation helpers take a host virtual address, whereas zapping a GFN obviously provides a guest physical address and with the wrong unit of measurement (frame vs. byte). Alternatively, KVM could walk all memslots to get the associated HVAs, but thanks to SMM, that would require multiple lookups. And practically speaking, kvm_zap_gfn_range() usage is quite rare and not a hot path, e.g. MTRR and CR0.CD are almost guaranteed to be done only on vCPU0 during boot, and APICv inhibits are similarly infrequent operations. Fixes: edb298c663fc ("KVM: x86/mmu: bump mmu notifier count in kvm_zap_gfn_range") Reported-by: Chao Peng Cc: stable@vger.kernel.org Cc: Maxim Levitsky Signed-off-by: Sean Christopherson Message-Id: <20221111001841.2412598-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6f81539061d6..1ccb769f62af 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6056,7 +6056,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_mmu_invalidate_begin(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_begin(kvm, 0, -1ul); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6070,7 +6070,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end - gfn_start); - kvm_mmu_invalidate_end(kvm, gfn_start, gfn_end); + kvm_mmu_invalidate_end(kvm, 0, -1ul); write_unlock(&kvm->mmu_lock); }