From cf64527bb33f6cec2ed50f89182fc4688d0056b6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 21 Jul 2019 13:52:18 +0200 Subject: [PATCH 1/7] KVM: nVMX: Clear pending KVM_REQ_GET_VMCS12_PAGES when leaving nested Letting this pend may cause nested_get_vmcs12_pages to run against an invalid state, corrupting the effective vmcs of L1. This was triggerable in QEMU after a guest corruption in L2, followed by a L1 reset. Signed-off-by: Jan Kiszka Reviewed-by: Liran Alon Cc: stable@vger.kernel.org Fixes: 7f7f1ba33cf2 ("KVM: x86: do not load vmcs12 pages while still in SMM") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 0f1378789bd0..4cdab4b4eff1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -220,6 +220,8 @@ static void free_nested(struct kvm_vcpu *vcpu) if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon) return; + kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); + vmx->nested.vmxon = false; vmx->nested.smm.vmxon = false; free_vpid(vmx->nested.vpid02); From ec269475cba7bcdd1eb8fdf8e87f4c6c81a376fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Jul 2019 13:31:27 +0200 Subject: [PATCH 2/7] Revert "kvm: x86: Use task structs fpu field for user" This reverts commit 240c35a3783ab9b3a0afaba0dde7291295680a6b ("kvm: x86: Use task structs fpu field for user", 2018-11-06). The commit is broken and causes QEMU's FPU state to be destroyed when KVM_RUN is preempted. Fixes: 240c35a3783a ("kvm: x86: Use task structs fpu field for user") Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 7 ++++--- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0cc5b611a113..b2f1ffb937af 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -607,15 +607,16 @@ struct kvm_vcpu_arch { /* * QEMU userspace and the guest each have their own FPU state. - * In vcpu_run, we switch between the user, maintained in the - * task_struct struct, and guest FPU contexts. While running a VCPU, - * the VCPU thread will have the guest FPU context. + * In vcpu_run, we switch between the user and guest FPU contexts. + * While running a VCPU, the VCPU thread will have the guest FPU + * context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. */ + struct fpu user_fpu; struct fpu *guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58305cf81182..cf2afdf8facf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8270,7 +8270,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(¤t->thread.fpu); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8287,7 +8287,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpregs_lock(); copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(¤t->thread.fpu.state); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); fpregs_mark_activate(); fpregs_unlock(); From e751732486eb3f159089a64d1901992b1357e7cc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 22 Jul 2019 12:26:20 +0800 Subject: [PATCH 3/7] KVM: X86: Fix fpu state crash in kvm guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The idea before commit 240c35a37 (which has just been reverted) was that we have the following FPU states: userspace (QEMU) guest --------------------------------------------------------------------------- processor vcpu->arch.guest_fpu >>> KVM_RUN: kvm_load_guest_fpu vcpu->arch.user_fpu processor >>> preempt out vcpu->arch.user_fpu current->thread.fpu >>> preempt in vcpu->arch.user_fpu processor >>> back to userspace >>> kvm_put_guest_fpu processor vcpu->arch.guest_fpu --------------------------------------------------------------------------- With the new lazy model we want to get the state back to the processor when schedule in from current->thread.fpu. Reported-by: Thomas Lambertz Reported-by: anthony Tested-by: anthony Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Thomas Lambertz Cc: anthony Cc: stable@vger.kernel.org Fixes: 5f409e20b (x86/fpu: Defer FPU state load until return to userspace) Signed-off-by: Wanpeng Li [Add a comment in front of the warning. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cf2afdf8facf..7eb56f8e2ea8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3306,6 +3306,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); @@ -7990,9 +7994,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) trace_kvm_entry(vcpu->vcpu_id); guest_enter_irqoff(); - fpregs_assert_state_consistent(); - if (test_thread_flag(TIF_NEED_FPU_LOAD)) - switch_fpu_return(); + /* The preempt notifier should have taken care of the FPU already. */ + WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD)); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); From d9a710e5fc4941944d565b013414e9fdc66242b5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 22 Jul 2019 12:26:21 +0800 Subject: [PATCH 4/7] KVM: X86: Dynamically allocate user_fpu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After reverting commit 240c35a3783a (kvm: x86: Use task structs fpu field for user), struct kvm_vcpu is 19456 bytes on my server, PAGE_ALLOC_COSTLY_ORDER(3) is the order at which allocations are deemed costly to service. In serveless scenario, one host can service hundreds/thoudands firecracker/kata-container instances, howerver, new instance will fail to launch after memory is too fragmented to allocate kvm_vcpu struct on host, this was observed in some cloud provider product environments. This patch dynamically allocates user_fpu, kvm_vcpu is 15168 bytes now on my Skylake server. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 13 ++++++++++++- arch/x86/kvm/vmx/vmx.c | 13 ++++++++++++- arch/x86/kvm/x86.c | 4 ++-- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b2f1ffb937af..e74f0711eaaf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -616,7 +616,7 @@ struct kvm_vcpu_arch { * "guest_fpu" state here contains the guest FPU context, with the * host PRKU bits. */ - struct fpu user_fpu; + struct fpu *user_fpu; struct fpu *guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 19f69df96758..7eafc6907861 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2143,12 +2143,20 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!svm->vcpu.arch.user_fpu) { + printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); + err = -ENOMEM; + goto free_partial_svm; + } + svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!svm->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_partial_svm; + goto free_user_fpu; } err = kvm_vcpu_init(&svm->vcpu, kvm, id); @@ -2211,6 +2219,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) kvm_vcpu_uninit(&svm->vcpu); free_svm: kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); free_partial_svm: kmem_cache_free(kvm_vcpu_cache, svm); out: @@ -2241,6 +2251,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu); kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, svm); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a279447eb75b..074385c86c09 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6598,6 +6598,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); kmem_cache_free(kvm_vcpu_cache, vmx); } @@ -6613,12 +6614,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vmx->vcpu.arch.user_fpu) { + printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n"); + err = -ENOMEM; + goto free_partial_vcpu; + } + vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx->vcpu.arch.guest_fpu) { printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n"); err = -ENOMEM; - goto free_partial_vcpu; + goto free_user_fpu; } vmx->vpid = allocate_vpid(); @@ -6721,6 +6730,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) free_vcpu: free_vpid(vmx->vpid); kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu); +free_user_fpu: + kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu); free_partial_vcpu: kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7eb56f8e2ea8..01e18caac825 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8273,7 +8273,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { fpregs_lock(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + copy_fpregs_to_fpstate(vcpu->arch.user_fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); @@ -8290,7 +8290,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpregs_lock(); copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); fpregs_mark_activate(); fpregs_unlock(); From c6bf2ae931adbd3e10967e12142856439a211813 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 21 Jul 2019 16:01:36 +0200 Subject: [PATCH 5/7] KVM: nVMX: Set cached_vmcs12 and cached_shadow_vmcs12 NULL after free Shall help finding use-after-free bugs earlier. Suggested-by: Liran Alon Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 4cdab4b4eff1..ced9fba32598 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -234,7 +234,9 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); + vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); + vmx->nested.cached_shadow_vmcs12 = NULL; /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); From 2f5947dfcaecb99f2dd559156eecbeb7b95e4c02 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Jul 2019 09:24:49 +0200 Subject: [PATCH 6/7] Documentation: move Documentation/virtual to Documentation/virt Renaming docs seems to be en vogue at the moment, so fix on of the grossly misnamed directories. We usually never use "virtual" as a shortcut for virtualization in the kernel, but always virt, as seen in the virt/ top-level directory. Fix up the documentation to match that. Fixes: ed16648eb5b8 ("Move kvm, uml, and lguest subdirectories under a common "virtual" directory, I.E:") Signed-off-by: Christoph Hellwig Signed-off-by: Paolo Bonzini --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/{virtual => virt}/index.rst | 0 .../{virtual => virt}/kvm/amd-memory-encryption.rst | 0 Documentation/{virtual => virt}/kvm/api.txt | 2 +- Documentation/{virtual => virt}/kvm/arm/hyp-abi.txt | 0 Documentation/{virtual => virt}/kvm/arm/psci.txt | 0 Documentation/{virtual => virt}/kvm/cpuid.rst | 0 Documentation/{virtual => virt}/kvm/devices/README | 0 .../{virtual => virt}/kvm/devices/arm-vgic-its.txt | 0 Documentation/{virtual => virt}/kvm/devices/arm-vgic-v3.txt | 0 Documentation/{virtual => virt}/kvm/devices/arm-vgic.txt | 0 Documentation/{virtual => virt}/kvm/devices/mpic.txt | 0 Documentation/{virtual => virt}/kvm/devices/s390_flic.txt | 0 Documentation/{virtual => virt}/kvm/devices/vcpu.txt | 0 Documentation/{virtual => virt}/kvm/devices/vfio.txt | 0 Documentation/{virtual => virt}/kvm/devices/vm.txt | 0 Documentation/{virtual => virt}/kvm/devices/xics.txt | 0 Documentation/{virtual => virt}/kvm/devices/xive.txt | 0 Documentation/{virtual => virt}/kvm/halt-polling.txt | 0 Documentation/{virtual => virt}/kvm/hypercalls.txt | 4 ++-- Documentation/{virtual => virt}/kvm/index.rst | 0 Documentation/{virtual => virt}/kvm/locking.txt | 0 Documentation/{virtual => virt}/kvm/mmu.txt | 2 +- Documentation/{virtual => virt}/kvm/msr.txt | 0 Documentation/{virtual => virt}/kvm/nested-vmx.txt | 0 Documentation/{virtual => virt}/kvm/ppc-pv.txt | 0 Documentation/{virtual => virt}/kvm/review-checklist.txt | 2 +- Documentation/{virtual => virt}/kvm/s390-diag.txt | 0 Documentation/{virtual => virt}/kvm/timekeeping.txt | 0 Documentation/{virtual => virt}/kvm/vcpu-requests.rst | 0 Documentation/{virtual => virt}/paravirt_ops.rst | 0 Documentation/{virtual => virt}/uml/UserModeLinux-HOWTO.txt | 0 MAINTAINERS | 6 +++--- arch/powerpc/include/uapi/asm/kvm_para.h | 2 +- arch/x86/kvm/mmu.c | 2 +- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 4 ++-- virt/kvm/arm/arm.c | 2 +- virt/kvm/arm/vgic/vgic-mmio-v3.c | 2 +- virt/kvm/arm/vgic/vgic.h | 4 ++-- 40 files changed, 19 insertions(+), 19 deletions(-) rename Documentation/{virtual => virt}/index.rst (100%) rename Documentation/{virtual => virt}/kvm/amd-memory-encryption.rst (100%) rename Documentation/{virtual => virt}/kvm/api.txt (99%) rename Documentation/{virtual => virt}/kvm/arm/hyp-abi.txt (100%) rename Documentation/{virtual => virt}/kvm/arm/psci.txt (100%) rename Documentation/{virtual => virt}/kvm/cpuid.rst (100%) rename Documentation/{virtual => virt}/kvm/devices/README (100%) rename Documentation/{virtual => virt}/kvm/devices/arm-vgic-its.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/arm-vgic-v3.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/arm-vgic.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/mpic.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/s390_flic.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/vcpu.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/vfio.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/vm.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/xics.txt (100%) rename Documentation/{virtual => virt}/kvm/devices/xive.txt (100%) rename Documentation/{virtual => virt}/kvm/halt-polling.txt (100%) rename Documentation/{virtual => virt}/kvm/hypercalls.txt (97%) rename Documentation/{virtual => virt}/kvm/index.rst (100%) rename Documentation/{virtual => virt}/kvm/locking.txt (100%) rename Documentation/{virtual => virt}/kvm/mmu.txt (99%) rename Documentation/{virtual => virt}/kvm/msr.txt (100%) rename Documentation/{virtual => virt}/kvm/nested-vmx.txt (100%) rename Documentation/{virtual => virt}/kvm/ppc-pv.txt (100%) rename Documentation/{virtual => virt}/kvm/review-checklist.txt (95%) rename Documentation/{virtual => virt}/kvm/s390-diag.txt (100%) rename Documentation/{virtual => virt}/kvm/timekeeping.txt (100%) rename Documentation/{virtual => virt}/kvm/vcpu-requests.rst (100%) rename Documentation/{virtual => virt}/paravirt_ops.rst (100%) rename Documentation/{virtual => virt}/uml/UserModeLinux-HOWTO.txt (100%) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 099c5a4be95b..8a8880cec34b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2532,7 +2532,7 @@ mem_encrypt=on: Activate SME mem_encrypt=off: Do not activate SME - Refer to Documentation/virtual/kvm/amd-memory-encryption.rst + Refer to Documentation/virt/kvm/amd-memory-encryption.rst for details on when memory encryption can be activated. mem_sleep_default= [SUSPEND] Default system suspend mode: diff --git a/Documentation/virtual/index.rst b/Documentation/virt/index.rst similarity index 100% rename from Documentation/virtual/index.rst rename to Documentation/virt/index.rst diff --git a/Documentation/virtual/kvm/amd-memory-encryption.rst b/Documentation/virt/kvm/amd-memory-encryption.rst similarity index 100% rename from Documentation/virtual/kvm/amd-memory-encryption.rst rename to Documentation/virt/kvm/amd-memory-encryption.rst diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virt/kvm/api.txt similarity index 99% rename from Documentation/virtual/kvm/api.txt rename to Documentation/virt/kvm/api.txt index e54a3f51ddc5..2d067767b617 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virt/kvm/api.txt @@ -3781,7 +3781,7 @@ encrypted VMs. Currently, this ioctl is used for issuing Secure Encrypted Virtualization (SEV) commands on AMD Processors. The SEV commands are defined in -Documentation/virtual/kvm/amd-memory-encryption.rst. +Documentation/virt/kvm/amd-memory-encryption.rst. 4.111 KVM_MEMORY_ENCRYPT_REG_REGION diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virt/kvm/arm/hyp-abi.txt similarity index 100% rename from Documentation/virtual/kvm/arm/hyp-abi.txt rename to Documentation/virt/kvm/arm/hyp-abi.txt diff --git a/Documentation/virtual/kvm/arm/psci.txt b/Documentation/virt/kvm/arm/psci.txt similarity index 100% rename from Documentation/virtual/kvm/arm/psci.txt rename to Documentation/virt/kvm/arm/psci.txt diff --git a/Documentation/virtual/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst similarity index 100% rename from Documentation/virtual/kvm/cpuid.rst rename to Documentation/virt/kvm/cpuid.rst diff --git a/Documentation/virtual/kvm/devices/README b/Documentation/virt/kvm/devices/README similarity index 100% rename from Documentation/virtual/kvm/devices/README rename to Documentation/virt/kvm/devices/README diff --git a/Documentation/virtual/kvm/devices/arm-vgic-its.txt b/Documentation/virt/kvm/devices/arm-vgic-its.txt similarity index 100% rename from Documentation/virtual/kvm/devices/arm-vgic-its.txt rename to Documentation/virt/kvm/devices/arm-vgic-its.txt diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt b/Documentation/virt/kvm/devices/arm-vgic-v3.txt similarity index 100% rename from Documentation/virtual/kvm/devices/arm-vgic-v3.txt rename to Documentation/virt/kvm/devices/arm-vgic-v3.txt diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virt/kvm/devices/arm-vgic.txt similarity index 100% rename from Documentation/virtual/kvm/devices/arm-vgic.txt rename to Documentation/virt/kvm/devices/arm-vgic.txt diff --git a/Documentation/virtual/kvm/devices/mpic.txt b/Documentation/virt/kvm/devices/mpic.txt similarity index 100% rename from Documentation/virtual/kvm/devices/mpic.txt rename to Documentation/virt/kvm/devices/mpic.txt diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virt/kvm/devices/s390_flic.txt similarity index 100% rename from Documentation/virtual/kvm/devices/s390_flic.txt rename to Documentation/virt/kvm/devices/s390_flic.txt diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virt/kvm/devices/vcpu.txt similarity index 100% rename from Documentation/virtual/kvm/devices/vcpu.txt rename to Documentation/virt/kvm/devices/vcpu.txt diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virt/kvm/devices/vfio.txt similarity index 100% rename from Documentation/virtual/kvm/devices/vfio.txt rename to Documentation/virt/kvm/devices/vfio.txt diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virt/kvm/devices/vm.txt similarity index 100% rename from Documentation/virtual/kvm/devices/vm.txt rename to Documentation/virt/kvm/devices/vm.txt diff --git a/Documentation/virtual/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt similarity index 100% rename from Documentation/virtual/kvm/devices/xics.txt rename to Documentation/virt/kvm/devices/xics.txt diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt similarity index 100% rename from Documentation/virtual/kvm/devices/xive.txt rename to Documentation/virt/kvm/devices/xive.txt diff --git a/Documentation/virtual/kvm/halt-polling.txt b/Documentation/virt/kvm/halt-polling.txt similarity index 100% rename from Documentation/virtual/kvm/halt-polling.txt rename to Documentation/virt/kvm/halt-polling.txt diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virt/kvm/hypercalls.txt similarity index 97% rename from Documentation/virtual/kvm/hypercalls.txt rename to Documentation/virt/kvm/hypercalls.txt index da210651f714..5f6d291bd004 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virt/kvm/hypercalls.txt @@ -18,7 +18,7 @@ S390: number in R1. For further information on the S390 diagnose call as supported by KVM, - refer to Documentation/virtual/kvm/s390-diag.txt. + refer to Documentation/virt/kvm/s390-diag.txt. PowerPC: It uses R3-R10 and hypercall number in R11. R4-R11 are used as output registers. @@ -26,7 +26,7 @@ S390: KVM hypercalls uses 4 byte opcode, that are patched with 'hypercall-instructions' property inside the device tree's /hypervisor node. - For more information refer to Documentation/virtual/kvm/ppc-pv.txt + For more information refer to Documentation/virt/kvm/ppc-pv.txt MIPS: KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall diff --git a/Documentation/virtual/kvm/index.rst b/Documentation/virt/kvm/index.rst similarity index 100% rename from Documentation/virtual/kvm/index.rst rename to Documentation/virt/kvm/index.rst diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virt/kvm/locking.txt similarity index 100% rename from Documentation/virtual/kvm/locking.txt rename to Documentation/virt/kvm/locking.txt diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virt/kvm/mmu.txt similarity index 99% rename from Documentation/virtual/kvm/mmu.txt rename to Documentation/virt/kvm/mmu.txt index 2efe0efc516e..1b9880dfba0a 100644 --- a/Documentation/virtual/kvm/mmu.txt +++ b/Documentation/virt/kvm/mmu.txt @@ -298,7 +298,7 @@ Handling a page fault is performed as follows: vcpu->arch.mmio_gfn, and call the emulator - If both P bit and R/W bit of error code are set, this could possibly be handled as a "fast page fault" (fixed without taking the MMU lock). See - the description in Documentation/virtual/kvm/locking.txt. + the description in Documentation/virt/kvm/locking.txt. - if needed, walk the guest page tables to determine the guest translation (gva->gpa or ngpa->gpa) - if permissions are insufficient, reflect the fault back to the guest diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virt/kvm/msr.txt similarity index 100% rename from Documentation/virtual/kvm/msr.txt rename to Documentation/virt/kvm/msr.txt diff --git a/Documentation/virtual/kvm/nested-vmx.txt b/Documentation/virt/kvm/nested-vmx.txt similarity index 100% rename from Documentation/virtual/kvm/nested-vmx.txt rename to Documentation/virt/kvm/nested-vmx.txt diff --git a/Documentation/virtual/kvm/ppc-pv.txt b/Documentation/virt/kvm/ppc-pv.txt similarity index 100% rename from Documentation/virtual/kvm/ppc-pv.txt rename to Documentation/virt/kvm/ppc-pv.txt diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virt/kvm/review-checklist.txt similarity index 95% rename from Documentation/virtual/kvm/review-checklist.txt rename to Documentation/virt/kvm/review-checklist.txt index a83b27635fdd..499af499e296 100644 --- a/Documentation/virtual/kvm/review-checklist.txt +++ b/Documentation/virt/kvm/review-checklist.txt @@ -7,7 +7,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/virtual/kvm/api.txt + - the API must be documented in Documentation/virt/kvm/api.txt - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/Documentation/virtual/kvm/s390-diag.txt b/Documentation/virt/kvm/s390-diag.txt similarity index 100% rename from Documentation/virtual/kvm/s390-diag.txt rename to Documentation/virt/kvm/s390-diag.txt diff --git a/Documentation/virtual/kvm/timekeeping.txt b/Documentation/virt/kvm/timekeeping.txt similarity index 100% rename from Documentation/virtual/kvm/timekeeping.txt rename to Documentation/virt/kvm/timekeeping.txt diff --git a/Documentation/virtual/kvm/vcpu-requests.rst b/Documentation/virt/kvm/vcpu-requests.rst similarity index 100% rename from Documentation/virtual/kvm/vcpu-requests.rst rename to Documentation/virt/kvm/vcpu-requests.rst diff --git a/Documentation/virtual/paravirt_ops.rst b/Documentation/virt/paravirt_ops.rst similarity index 100% rename from Documentation/virtual/paravirt_ops.rst rename to Documentation/virt/paravirt_ops.rst diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virt/uml/UserModeLinux-HOWTO.txt similarity index 100% rename from Documentation/virtual/uml/UserModeLinux-HOWTO.txt rename to Documentation/virt/uml/UserModeLinux-HOWTO.txt diff --git a/MAINTAINERS b/MAINTAINERS index debbb7b97c98..1aec93695040 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8727,7 +8727,7 @@ L: kvm@vger.kernel.org W: http://www.linux-kvm.org T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git S: Supported -F: Documentation/virtual/kvm/ +F: Documentation/virt/kvm/ F: include/trace/events/kvm.h F: include/uapi/asm-generic/kvm* F: include/uapi/linux/kvm* @@ -12054,7 +12054,7 @@ M: Juergen Gross M: Alok Kataria L: virtualization@lists.linux-foundation.org S: Supported -F: Documentation/virtual/paravirt_ops.txt +F: Documentation/virt/paravirt_ops.txt F: arch/*/kernel/paravirt* F: arch/*/include/asm/paravirt*.h F: include/linux/hypervisor.h @@ -16745,7 +16745,7 @@ W: http://user-mode-linux.sourceforge.net Q: https://patchwork.ozlabs.org/project/linux-um/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/rw/uml.git S: Maintained -F: Documentation/virtual/uml/ +F: Documentation/virt/uml/ F: arch/um/ F: arch/x86/um/ F: fs/hostfs/ diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h index 01555c6ae0f5..be48c2215fa2 100644 --- a/arch/powerpc/include/uapi/asm/kvm_para.h +++ b/arch/powerpc/include/uapi/asm/kvm_para.h @@ -31,7 +31,7 @@ * Struct fields are always 32 or 64 bit aligned, depending on them being 32 * or 64 bit wide respectively. * - * See Documentation/virtual/kvm/ppc-pv.txt + * See Documentation/virt/kvm/ppc-pv.txt */ struct kvm_vcpu_arch_shared { __u64 scratch1; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8f72526e2f68..24843cf49579 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3466,7 +3466,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, /* * Currently, fast page fault only works for direct mapping * since the gfn is not stable for indirect shadow page. See - * Documentation/virtual/kvm/locking.txt to get more detail. + * Documentation/virt/kvm/locking.txt to get more detail. */ fault_handled = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a7c19540ce21..5e3f12d5359e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1086,7 +1086,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c2152f3dd02d..e7c67be7c15f 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -116,7 +116,7 @@ struct kvm_irq_level { * ACPI gsi notion of irq. * For IA-64 (APIC model) IOAPIC0: irq 0-23; IOAPIC1: irq 24-47.. * For X86 (standard AT mode) PIC0/1: irq 0-15. IOAPIC0: 0-23.. - * For ARM: See Documentation/virtual/kvm/api.txt + * For ARM: See Documentation/virt/kvm/api.txt */ union { __u32 irq; @@ -1085,7 +1085,7 @@ struct kvm_xen_hvm_config { * * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies * the irqfd to operate in resampling mode for level triggered interrupt - * emulation. See Documentation/virtual/kvm/api.txt. + * emulation. See Documentation/virt/kvm/api.txt. */ #define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f645c0fbf7ec..acc43242a310 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -727,7 +727,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and - * Documentation/virtual/kvm/vcpu-requests.rst + * Documentation/virt/kvm/vcpu-requests.rst */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 936962abc38d..c45e2d7e942f 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -250,7 +250,7 @@ static unsigned long vgic_v3_uaccess_read_pending(struct kvm_vcpu *vcpu, * pending state of interrupt is latched in pending_latch variable. * Userspace will save and restore pending state and line_level * separately. - * Refer to Documentation/virtual/kvm/devices/arm-vgic-v3.txt + * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.txt * for handling of ISPENDR and ICPENDR. */ for (i = 0; i < len * 8; i++) { diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 57205beaa981..3b7525deec80 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -42,7 +42,7 @@ VGIC_AFFINITY_LEVEL(val, 3)) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-v3.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-v3.txt, * below macros are defined for CPUREG encoding. */ #define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK 0x000000000000c000 @@ -63,7 +63,7 @@ KVM_REG_ARM_VGIC_SYSREG_OP2_MASK) /* - * As per Documentation/virtual/kvm/devices/arm-vgic-its.txt, + * As per Documentation/virt/kvm/devices/arm-vgic-its.txt, * below macros are defined for ITS table entry encoding. */ #define KVM_ITS_CTE_VALID_SHIFT 63 From 266e85a5ec9100dcd9ae03601453bbc96fefee5d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 24 Jul 2019 17:43:13 +0800 Subject: [PATCH 7/7] KVM: X86: Boost queue head vCPU to mitigate lock waiter preemption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 11752adb (locking/pvqspinlock: Implement hybrid PV queued/unfair locks) introduces hybrid PV queued/unfair locks - queued mode (no starvation) - unfair mode (good performance on not heavily contended lock) The lock waiter goes into the unfair mode especially in VMs with over-commit vCPUs since increaing over-commitment increase the likehood that the queue head vCPU may have been preempted and not actively spinning. However, reschedule queue head vCPU timely to acquire the lock still can get better performance than just depending on lock stealing in over-subscribe scenario. Testing on 80 HT 2 socket Xeon Skylake server, with 80 vCPUs VM 80GB RAM: ebizzy -M vanilla boosting improved 1VM 23520 25040 6% 2VM 8000 13600 70% 3VM 3100 5400 74% The lock holder vCPU yields to the queue head vCPU when unlock, to boost queue head vCPU which is involuntary preemption or the one which is voluntary halt due to fail to acquire the lock after a short spin in the guest. Cc: Waiman Long Cc: Peter Zijlstra Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01e18caac825..c6d951cbd76c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7206,7 +7206,7 @@ static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) rcu_read_unlock(); - if (target) + if (target && READ_ONCE(target->ready)) kvm_vcpu_yield_to(target); } @@ -7246,6 +7246,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; case KVM_HC_KICK_CPU: kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_sched_yield(vcpu->kvm, a1); ret = 0; break; #ifdef CONFIG_X86_64