From 6f5c9600621b4efb5c61b482d767432eb1ad3a9c Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 13 Mar 2024 13:58:42 +0100 Subject: [PATCH 1/6] KVM: x86: Don't advertise guest.MAXPHYADDR as host.MAXPHYADDR in CPUID Drop KVM's propagation of GuestPhysBits (CPUID leaf 80000008, EAX[23:16]) to HostPhysBits (same leaf, EAX[7:0]) when advertising the address widths to userspace via KVM_GET_SUPPORTED_CPUID. Per AMD, GuestPhysBits is intended for software use, and physical CPUs do not set that field. I.e. GuestPhysBits will be non-zero if and only if KVM is running as a nested hypervisor, and in that case, GuestPhysBits is NOT guaranteed to capture the CPU's effective MAXPHYADDR when running with TDP enabled. E.g. KVM will soon use GuestPhysBits to communicate the CPU's maximum *addressable* guest physical address, which would result in KVM under- reporting PhysBits when running as an L1 on a CPU with MAXPHYADDR=52, but without 5-level paging. Signed-off-by: Gerd Hoffmann Cc: stable@vger.kernel.org Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240313125844.912415-2-kraxel@redhat.com [sean: rewrite changelog with --verbose, Cc stable@] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bfc0bfcb2bc6..d1cbb14f8553 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1231,9 +1231,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { - unsigned g_phys_as = (entry->eax >> 16) & 0xff; - unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned phys_as = entry->eax & 0xff; + unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned int phys_as; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as @@ -1241,16 +1240,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * reductions in MAXPHYADDR for memory encryption affect shadow * paging, too. * - * If TDP is enabled but an explicit guest MAXPHYADDR is not - * provided, use the raw bare metal MAXPHYADDR as reductions to - * the HPAs do not affect GPAs. + * If TDP is enabled, use the raw bare metal MAXPHYADDR as + * reductions to the HPAs do not affect GPAs. */ - if (!tdp_enabled) - g_phys_as = boot_cpu_data.x86_phys_bits; - else if (!g_phys_as) - g_phys_as = phys_as; + if (!tdp_enabled) { + phys_as = boot_cpu_data.x86_phys_bits; + } else { + phys_as = entry->eax & 0xff; + } - entry->eax = g_phys_as | (virt_as << 8); + entry->eax = phys_as | (virt_as << 8); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); From b628cb523c65420031b310050a3733aa7fbe2e88 Mon Sep 17 00:00:00 2001 From: Gerd Hoffmann Date: Wed, 13 Mar 2024 13:58:43 +0100 Subject: [PATCH 2/6] KVM: x86: Advertise max mappable GPA in CPUID.0x80000008.GuestPhysBits Use the GuestPhysBits field in CPUID.0x80000008 to communicate the max mappable GPA to userspace, i.e. the max GPA that is addressable by the CPU itself. Typically this is identical to the max effective GPA, except in the case where the CPU supports MAXPHYADDR > 48 but does not support 5-level TDP (the CPU consults bits 51:48 of the GPA only when walking the fifth level TDP page table entry). Enumerating the max mappable GPA via CPUID will allow guest firmware to map resources like PCI bars in the highest possible address space, while ensuring that the GPA is addressable by the CPU. Without precise knowledge about the max mappable GPA, the guest must assume that 5-level paging is unsupported and thus restrict its mappings to the lower 48 bits. Advertise the max mappable GPA via KVM_GET_SUPPORTED_CPUID as userspace doesn't have easy access to whether or not 5-level paging is supported, and to play nice with userspace VMMs that reflect the supported CPUID directly into the guest. AMD's APM (3.35) defines GuestPhysBits (EAX[23:16]) as: Maximum guest physical address size in bits. This number applies only to guests using nested paging. When this field is zero, refer to the PhysAddrSize field for the maximum guest physical address size. Tom Lendacky confirmed that the purpose of GuestPhysBits is software use and KVM can use it as described above. Real hardware always returns zero. Leave GuestPhysBits as '0' when TDP is disabled in order to comply with the APM's statement that GuestPhysBits "applies only to guest using nested paging". As above, guest firmware will likely create suboptimal mappings, but that is a very minor issue and not a functional concern. Signed-off-by: Gerd Hoffmann Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20240313125844.912415-3-kraxel@redhat.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 28 +++++++++++++++++++++++++--- arch/x86/kvm/mmu.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 5 +++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index d1cbb14f8553..1c5583addc90 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -1231,8 +1231,22 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->eax = entry->ebx = entry->ecx = 0; break; case 0x80000008: { + /* + * GuestPhysAddrSize (EAX[23:16]) is intended for software + * use. + * + * KVM's ABI is to report the effective MAXPHYADDR for the + * guest in PhysAddrSize (phys_as), and the maximum + * *addressable* GPA in GuestPhysAddrSize (g_phys_as). + * + * GuestPhysAddrSize is valid if and only if TDP is enabled, + * in which case the max GPA that can be addressed by KVM may + * be less than the max GPA that can be legally generated by + * the guest, e.g. if MAXPHYADDR>48 but the CPU doesn't + * support 5-level TDP. + */ unsigned int virt_as = max((entry->eax >> 8) & 0xff, 48U); - unsigned int phys_as; + unsigned int phys_as, g_phys_as; /* * If TDP (NPT) is disabled use the adjusted host MAXPHYADDR as @@ -1241,15 +1255,23 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) * paging, too. * * If TDP is enabled, use the raw bare metal MAXPHYADDR as - * reductions to the HPAs do not affect GPAs. + * reductions to the HPAs do not affect GPAs. The max + * addressable GPA is the same as the max effective GPA, except + * that it's capped at 48 bits if 5-level TDP isn't supported + * (hardware processes bits 51:48 only when walking the fifth + * level page table). */ if (!tdp_enabled) { phys_as = boot_cpu_data.x86_phys_bits; + g_phys_as = 0; } else { phys_as = entry->eax & 0xff; + g_phys_as = phys_as; + if (kvm_mmu_get_max_tdp_level() < 5) + g_phys_as = min(g_phys_as, 48); } - entry->eax = phys_as | (virt_as << 8); + entry->eax = phys_as | (virt_as << 8) | (g_phys_as << 16); entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8)); entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0008_EBX); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 60f21bb4c27b..b410a227c601 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -100,6 +100,8 @@ static inline u8 kvm_get_shadow_phys_bits(void) return boot_cpu_data.x86_phys_bits; } +u8 kvm_mmu_get_max_tdp_level(void); + void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 992e651540e8..db3a26eb7b75 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5322,6 +5322,11 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) return max_tdp_level; } +u8 kvm_mmu_get_max_tdp_level(void) +{ + return tdp_root_level ? tdp_root_level : max_tdp_level; +} + static union kvm_mmu_page_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, union kvm_cpu_role cpu_role) From a952d608f0bef10359449692e0fcff6608b6cd40 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 31 Jan 2024 09:23:57 +0800 Subject: [PATCH 3/6] KVM: Use vfree for memory allocated by vcalloc()/__vcalloc() commit 37b2a6510a48("KVM: use __vcalloc for very large allocations") replaced kvzalloc()/kvcalloc() with vcalloc(), but didn't replace kvfree() with vfree(). Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20240131012357.53563-1-lirongqing@baidu.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/page_track.c | 2 +- arch/x86/kvm/x86.c | 6 +++--- virt/kvm/kvm_main.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index f6448284c18e..561c331fd6ec 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -41,7 +41,7 @@ bool kvm_page_track_write_tracking_enabled(struct kvm *kvm) void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { - kvfree(slot->arch.gfn_write_track); + vfree(slot->arch.gfn_write_track); slot->arch.gfn_write_track = NULL; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 47d9f03b7778..7e654ebd9410 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12731,7 +12731,7 @@ static void memslot_rmap_free(struct kvm_memory_slot *slot) int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.rmap[i]); + vfree(slot->arch.rmap[i]); slot->arch.rmap[i] = NULL; } } @@ -12743,7 +12743,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } @@ -12835,7 +12835,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, memslot_rmap_free(slot); for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { - kvfree(slot->arch.lpage_info[i - 1]); + vfree(slot->arch.lpage_info[i - 1]); slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fb49c2a60200..711970d385f5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1020,7 +1020,7 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) if (!memslot->dirty_bitmap) return; - kvfree(memslot->dirty_bitmap); + vfree(memslot->dirty_bitmap); memslot->dirty_bitmap = NULL; } From 1d294dfaba8c35bd6d9558ae49ca36455e524cd1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 8 Apr 2024 16:15:00 -0700 Subject: [PATCH 4/6] KVM: x86: Allow, don't ignore, same-value writes to immutable MSRs When handling userspace writes to immutable feature MSRs for a vCPU that has already run, fall through into the normal code to set the MSR instead of immediately returning '0'. I.e. allow such writes, instead of ignoring such writes. This fixes a bug where KVM incorrectly allows writes to the VMX MSRs that enumerate which CR{0,4} can be set, but only if the vCPU has already run. The intent of returning '0' and thus ignoring the write, was to avoid any side effects, e.g. refreshing the PMU and thus doing weird things with perf events while the vCPU is running. That approach sounds nice in theory, but in practice it makes it all but impossible to maintain a sane ABI, e.g. all VMX MSRs return -EBUSY if the CPU is post-VMXON, and the VMX MSRs for fixed-1 CR bits are never writable, etc. As for refreshing the PMU, kvm_set_msr_common() explicitly skips the PMU refresh if MSR_IA32_PERF_CAPABILITIES is being written with the current value, specifically to avoid unwanted side effects. And if necessary, adding similar logic for other MSRs is not difficult. Fixes: 0094f62c7eaa ("KVM: x86: Disallow writes to immutable feature MSRs after KVM_RUN") Reported-by: Jim Mattson Cc: Raghavendra Rao Ananta Link: https://lore.kernel.org/r/20240408231500.1388122-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e654ebd9410..f126c65239b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2230,16 +2230,13 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) /* * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does * not support modifying the guest vCPU model on the fly, e.g. changing - * the nVMX capabilities while L2 is running is nonsensical. Ignore + * the nVMX capabilities while L2 is running is nonsensical. Allow * writes of the same value, e.g. to allow userspace to blindly stuff * all MSRs when emulating RESET. */ - if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) { - if (do_get_msr(vcpu, index, &val) || *data != val) - return -EINVAL; - - return 0; - } + if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index) && + (do_get_msr(vcpu, index, &val) || *data != val)) + return -EINVAL; return kvm_set_msr_ignored_check(vcpu, index, *data, true); } From 6982b34c21cb01bfe650cabcd4bb28584c8d589a Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Thu, 18 Apr 2024 02:18:22 +0000 Subject: [PATCH 5/6] KVM: x86: Only set APICV_INHIBIT_REASON_ABSENT if APICv is enabled Use the APICv enablement status to determine if APICV_INHIBIT_REASON_ABSENT needs to be set, instead of unconditionally setting the reason during initialization. Specifically, in cases where AVIC is disabled via module parameter or lack of hardware support, unconditionally setting an inhibit reason due to the absence of an in-kernel local APIC can lead to a scenario where the reason incorrectly remains set after a local APIC has been created by either KVM_CREATE_IRQCHIP or the enabling of KVM_CAP_IRQCHIP_SPLIT. This is because the helpers in charge of removing the inhibit return early if enable_apicv is not true, and therefore the bit remains set. This leads to confusion as to the cause why APICv is not active, since an incorrect reason will be reported by tracepoints and/or a debugging tool that examines the currently set inhibit reasons. Fixes: ef8b4b720368 ("KVM: ensure APICv is considered inactive if there is no APIC") Signed-off-by: Alejandro Jimenez Link: https://lore.kernel.org/r/20240418021823.1275276-2-alejandro.j.jimenez@oracle.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f126c65239b2..95a86ee871ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9992,15 +9992,12 @@ static void set_or_clear_apicv_inhibit(unsigned long *inhibits, static void kvm_apicv_init(struct kvm *kvm) { - unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; + enum kvm_apicv_inhibit reason = enable_apicv ? APICV_INHIBIT_REASON_ABSENT : + APICV_INHIBIT_REASON_DISABLE; + + set_or_clear_apicv_inhibit(&kvm->arch.apicv_inhibit_reasons, reason, true); init_rwsem(&kvm->arch.apicv_update_lock); - - set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true); - - if (!enable_apicv) - set_or_clear_apicv_inhibit(inhibits, - APICV_INHIBIT_REASON_DISABLE, true); } static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) From 51937f2aae186e335175dde78279aaf0cb5e72ae Mon Sep 17 00:00:00 2001 From: Alejandro Jimenez Date: Thu, 18 Apr 2024 02:18:23 +0000 Subject: [PATCH 6/6] KVM: x86: Remove VT-d mention in posted interrupt tracepoint The kvm_pi_irte_update tracepoint is called from both SVM and VMX vendor code, and while the "posted interrupt" naming is also adopted by SVM in several places, VT-d specifically refers to Intel's "Virtualization Technology for Directed I/O". Signed-off-by: Alejandro Jimenez Link: https://lore.kernel.org/r/20240418021823.1275276-3-alejandro.j.jimenez@oracle.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index c6b4b1728006..9d0b02ef307e 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1074,7 +1074,7 @@ TRACE_EVENT(kvm_smm_transition, ); /* - * Tracepoint for VT-d posted-interrupts. + * Tracepoint for VT-d posted-interrupts and AMD-Vi Guest Virtual APIC. */ TRACE_EVENT(kvm_pi_irte_update, TP_PROTO(unsigned int host_irq, unsigned int vcpu_id, @@ -1100,7 +1100,7 @@ TRACE_EVENT(kvm_pi_irte_update, __entry->set = set; ), - TP_printk("VT-d PI is %s for irq %u, vcpu %u, gsi: 0x%x, " + TP_printk("PI is %s for irq %u, vcpu %u, gsi: 0x%x, " "gvec: 0x%x, pi_desc_addr: 0x%llx", __entry->set ? "enabled and being updated" : "disabled", __entry->host_irq,