From 898f949fb7bc1210b79f06a04d1956d106a32633 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Mon, 7 Mar 2016 23:50:36 +0700 Subject: [PATCH 01/27] KVM: arm/arm64: disable preemption when calling smp_call_function_many Preemption must be disabled when calling smp_call_function_many Reported-by: bartosz.wawrzyniak@tieto.com Signed-off-by: Eric Auger Signed-off-by: Christoffer Dall --- arch/arm/kvm/arm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 3e0fb66d8e05..6accd66d26f0 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -373,7 +373,9 @@ static void exit_vm_noop(void *info) void force_vm_exit(const cpumask_t *mask) { + preempt_disable(); smp_call_function_many(mask, exit_vm_noop, NULL, true); + preempt_enable(); } /** From 2510ffe17f9707eb96cf286cf5d11ad372ff679f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Mar 2016 17:25:59 +0000 Subject: [PATCH 02/27] arm64: KVM: Turn kvm_ksym_ref into a NOP on VHE When running with VHE, there is no need to translate kernel pointers to the EL2 memory space, since we're already there (and we have a much saner memory map to start with). Unfortunately, kvm_ksym_ref is getting in the way, and the first call into the "hypervisor" section is going to end up in fireworks, since we're now branching into nowhereland. Meh. A potential solution is to test if VHE is engaged or not, and only perform the translation in the negative case. With this in place, VHE is able to run again. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_asm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 226f49d69ea9..eb7490d232a0 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -26,7 +26,13 @@ #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) -#define kvm_ksym_ref(sym) phys_to_virt((u64)&sym - kimage_voffset) +#define kvm_ksym_ref(sym) \ + ({ \ + void *val = &sym; \ + if (!is_kernel_in_hyp_mode()) \ + val = phys_to_virt((u64)&sym - kimage_voffset); \ + val; \ + }) #ifndef __ASSEMBLY__ struct kvm; From a6cdf1c08cbfe0818a3d8042844d75bf74fd82bd Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 21 Mar 2016 11:00:17 +0000 Subject: [PATCH 03/27] kvm: arm64: Disable compiler instrumentation for hypervisor code With the recent rewrite of the arm64 KVM hypervisor code in C, enabling certain options like KASAN would allow the compiler to generate memory accesses or function calls to addresses not mapped at EL2. This patch disables the compiler instrumentation on the arm64 hypervisor code for gcov-based profiling (GCOV_KERNEL), undefined behaviour sanity checker (UBSAN) and kernel address sanitizer (KASAN). 
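[ Editor's note: an illustration of why this matters, not part of the patch. With KASAN's out-of-line instrumentation the compiler turns an ordinary load into a call to a check function that lives in the normal kernel image and is therefore not mapped at EL2. The function names below are made up for illustration: ]

static int hyp_read_word(int *p)
{
	return *p;			/* plain load, safe at EL2 */
}

/* roughly what the compiler emits for the same code with KASAN enabled: */
static int hyp_read_word_kasan(int *p)
{
	__asan_load4((unsigned long)p);	/* out-of-line check in the kernel
					 * image -- unmapped at EL2, so the
					 * branch lands in nowhereland */
	return *p;
}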
Signed-off-by: Catalin Marinas Cc: Christoffer Dall Cc: Marc Zyngier Cc: Paolo Bonzini Cc: stable@vger.kernel.org # 4.5+ Signed-off-by: Christoffer Dall --- arch/arm64/kvm/hyp/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index b6a8fc5ad1af..778d0effa2af 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -16,3 +16,7 @@ obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o + +GCOV_PROFILE := n +KASAN_SANITIZE := n +UBSAN_SANITIZE := n From 2849eb4f99d54925c543db12917127f88b3c38ff Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:53:29 +0100 Subject: [PATCH 04/27] KVM: VMX: avoid guest hang on invalid invept instruction A guest executing an invalid invept instruction would hang because the instruction pointer was not updated. Cc: stable@vger.kernel.org Fixes: bfd0a56b90005f8c8a004baf407ad90045c2b11e Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75173efccac5..01f515873637 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7399,6 +7399,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } From f6870ee9e53430f2a318ccf0dd5e66bb46194e43 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:53:42 +0100 Subject: [PATCH 05/27] KVM: VMX: avoid guest hang on invalid invvpid instruction A guest executing an invalid invvpid instruction would hang because the instruction pointer was not updated. Reported-by: jmontleo@redhat.com Tested-by: jmontleo@redhat.com Cc: stable@vger.kernel.org Fixes: 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 01f515873637..0ce009cc23f1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7458,6 +7458,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) if (!(types & (1UL << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); return 1; } From ef697a712a6165aea7779c295604b099e8bfae2e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Mar 2016 16:58:38 +0100 Subject: [PATCH 06/27] KVM: VMX: fix nested vpid for old KVM guests Old KVM guests invoke single-context invvpid without actually checking whether it is supported. This was fixed by commit 518c8ae ("KVM: VMX: Make sure single type invvpid is supported before issuing invvpid instruction", 2010-08-01) and the patch after, but pre-2.6.36 kernels lack it, including RHEL 6.
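[ Editor's note: roughly what the affected pre-2.6.36 guest-side code looked like -- an illustrative reconstruction, not a quote; see commit 518c8ae for the actual fix. The capability MSR was never consulted before issuing the single-context flush: ]

static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	/* no check whether single-context invvpid is supported */
	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}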
Reported-by: jmontleo@redhat.com Tested-by: jmontleo@redhat.com Cc: stable@vger.kernel.org Fixes: 99b83ac893b84ed1a62ad6d1f2b6cc32026b9e85 Reviewed-by: David Matlack Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ce009cc23f1..161230016dfa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2712,8 +2712,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) } else vmx->nested.nested_vmx_ept_caps = 0; + /* + * Old versions of KVM use the single-context version without + * checking for support, so declare that it is supported even + * though it is treated as global context. The alternative is + * not failing the single-context invvpid, and it is worse. + */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -7475,12 +7482,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_SINGLE_CONTEXT: + /* + * Old versions of KVM use the single-context version so we + * have to support it; just treat it the same as all-context. + */ case VMX_VPID_EXTENT_ALL_CONTEXT: __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); nested_vmx_succeed(vcpu); break; default: - /* Trap single context invalidation invvpid calls */ + /* Trap individual address invalidation invvpid calls */ BUG_ON(1); break; } From 31217db720186edc7ae3c4354e5fa5848bb28358 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 18 Mar 2016 13:50:42 +1100 Subject: [PATCH 07/27] KVM: PPC: Create virtual-mode only TCE table handlers Upcoming in-kernel VFIO acceleration needs different handling in real and virtual modes, which makes it hard to support both modes in the same handler. This creates a copy of kvmppc_rm_h_stuff_tce and kvmppc_rm_h_put_tce in addition to the existing kvmppc_rm_h_put_tce_indirect. This also fixes linker breakage when only PR KVM was selected (leaving HV KVM off): the kvmppc_h_put_tce/kvmppc_h_stuff_tce functions would not be compiled at all and the link would fail.
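[ Editor's note: a sketch of the naming convention that results, condensed from the diffs below. The plain kvmppc_h_* names become virtual-mode handlers in book3s_64_vio.c, while the real-mode handlers in book3s_64_vio_hv.c carry an _rm_ infix: ]

/* virtual mode: runs with translation on, may access userspace */
long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
		      unsigned long ioba, unsigned long tce);

/* real mode: runs with the MMU off, in a restricted environment */
long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
			 unsigned long ioba, unsigned long tce);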
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_64_vio.c | 52 +++++++++++++++++++++++++ arch/powerpc/kvm/book3s_64_vio_hv.c | 8 ++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 82970042295e..18cf6d1f8174 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -209,6 +209,32 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return ret; } +long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce) +{ + struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + long ret; + + /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ + /* liobn, ioba, tce); */ + + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, 1); + if (ret != H_SUCCESS) + return ret; + + ret = kvmppc_tce_validate(stt, tce); + if (ret != H_SUCCESS) + return ret; + + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); + long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce_list, unsigned long npages) @@ -264,3 +290,29 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect); + +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages) +{ + struct kvmppc_spapr_tce_table *stt; + long i, ret; + + stt = kvmppc_find_table(vcpu, liobn); + if (!stt) + return H_TOO_HARD; + + ret = kvmppc_ioba_validate(stt, ioba, npages); + if (ret != H_SUCCESS) + return ret; + + /* Check permission bits only to allow userspace poison TCE for debug */ + if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) + return H_PARAMETER; + + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) + kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index f88b859af53b..d461c440889a 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -180,8 +180,8 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, - unsigned long ioba, unsigned long tce) +long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce) { struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); long ret; @@ -204,7 +204,6 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long ua, unsigned long *phpa) @@ -296,7 +295,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, return ret; } -long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, +long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce_value, unsigned long npages) { @@ -320,7 +319,6 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, return H_SUCCESS; } -EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce); long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) diff 
--git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 62ea3c6acdee..e571ad277398 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1942,7 +1942,7 @@ hcall_real_table: .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table - .long DOTSYM(kvmppc_h_put_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table .long 0 /* 0x24 - H_SET_SPRG0 */ .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table .long 0 /* 0x2c */ @@ -2020,7 +2020,7 @@ hcall_real_table: .long 0 /* 0x12c */ .long 0 /* 0x130 */ .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table - .long DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table .long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table .long 0 /* 0x140 */ .long 0 /* 0x144 */ From e9ad4ec8379ad1ba6f68b8ca1c26b50b5ae0a327 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Mar 2016 10:15:25 +0100 Subject: [PATCH 08/27] KVM: fix spin_lock_init order on x86 Moving the initialization earlier is needed in 4.6 because kvm_arch_init_vm is now using mmu_lock, causing lockdep to complain: [ 284.440294] INFO: trying to register non-static key. [ 284.445259] the code is fine but needs lockdep annotation. [ 284.450736] turning off the locking correctness validator. ... [ 284.528318] [] lock_acquire+0xd3/0x240 [ 284.533733] [] ? kvm_page_track_register_notifier+0x20/0x60 [kvm] [ 284.541467] [] _raw_spin_lock+0x41/0x80 [ 284.546960] [] ? kvm_page_track_register_notifier+0x20/0x60 [kvm] [ 284.554707] [] kvm_page_track_register_notifier+0x20/0x60 [kvm] [ 284.562281] [] kvm_mmu_init_vm+0x20/0x30 [kvm] [ 284.568381] [] kvm_arch_init_vm+0x1ea/0x200 [kvm] [ 284.574740] [] kvm_dev_ioctl+0xbf/0x4d0 [kvm] However, it also helps fix a preexisting problem, which is why this patch is also good for stable kernels: kvm_create_vm was incrementing current->mm->mm_count but not decrementing it at the out_err label (in case kvm_init_mmu_notifier failed). The new initialization order makes it possible to add the required mmdrop without adding a new error label.
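[ Editor's note: the shape of the fix, condensed from the diff below into a sketch; the NULL check and most error handling are trimmed, so this is not compilable as-is: ]

static struct kvm *kvm_create_vm_sketch(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	int r;

	/* Initialize locks and refcounts before anything can use them. */
	spin_lock_init(&kvm->mmu_lock);
	atomic_inc(&current->mm->mm_count);	/* paired with mmdrop() below */
	kvm->mm = current->mm;

	r = kvm_arch_init_vm(kvm, type);	/* may take mmu_lock in 4.6 */
	if (r)
		goto out_err;
	return kvm;

out_err:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);			/* now balanced on failure too */
	return ERR_PTR(r);
}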
Cc: stable@vger.kernel.org Reported-by: Borislav Petkov Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 99ee4b1ce2db..db021c383aa7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -536,6 +536,16 @@ static struct kvm *kvm_create_vm(unsigned long type) if (!kvm) return ERR_PTR(-ENOMEM); + spin_lock_init(&kvm->mmu_lock); + atomic_inc(¤t->mm->mm_count); + kvm->mm = current->mm; + kvm_eventfd_init(kvm); + mutex_init(&kvm->lock); + mutex_init(&kvm->irq_lock); + mutex_init(&kvm->slots_lock); + atomic_set(&kvm->users_count, 1); + INIT_LIST_HEAD(&kvm->devices); + r = kvm_arch_init_vm(kvm, type); if (r) goto out_err_no_disable; @@ -568,16 +578,6 @@ static struct kvm *kvm_create_vm(unsigned long type) goto out_err; } - spin_lock_init(&kvm->mmu_lock); - kvm->mm = current->mm; - atomic_inc(&kvm->mm->mm_count); - kvm_eventfd_init(kvm); - mutex_init(&kvm->lock); - mutex_init(&kvm->irq_lock); - mutex_init(&kvm->slots_lock); - atomic_set(&kvm->users_count, 1); - INIT_LIST_HEAD(&kvm->devices); - r = kvm_init_mmu_notifier(kvm); if (r) goto out_err; @@ -602,6 +602,7 @@ static struct kvm *kvm_create_vm(unsigned long type) for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) kvm_free_memslots(kvm, kvm->memslots[i]); kvm_arch_free_vm(kvm); + mmdrop(current->mm); return ERR_PTR(r); } From f13577e8aa863cf3b4f17541bc74f8e9b0b40c90 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 8 Mar 2016 10:08:16 +0100 Subject: [PATCH 09/27] KVM: MMU: return page fault error code from permission_fault This will help in the implementation of PKRU, where the PK bit of the page fault error code cannot be computed in advance (unlike I/D, R/W and U/S). Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.h | 15 ++++++++++----- arch/x86/kvm/paging_tmpl.h | 5 ++--- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 58fe98a0a526..8b2b3dfca1ae 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -141,11 +141,15 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) } /* - * Will a fault with a given page-fault error code (pfec) cause a permission - * fault with the given access (in ACC_* format)? + * Check if a given access (described through the I/D, W/R and U/S bits of a + * page fault error code pfec) causes a permission fault with the given PTE + * access rights (in ACC_* format). + * + * Return zero if the access does not fault; return the page fault error code + * if the access faults. 
*/ -static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) +static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + unsigned pte_access, unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -169,7 +173,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, WARN_ON(pfec & PFERR_RSVD_MASK); - return (mmu->permissions[index] >> pte_access) & 1; + pfec |= PFERR_PRESENT_MASK; + return -((mmu->permissions[index] >> pte_access) & 1) & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e159a8185ad9..45ed3c38d20e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -359,10 +359,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) { - errcode |= PFERR_PRESENT_MASK; + errcode = permission_fault(vcpu, mmu, pte_access, access); + if (unlikely(errcode)) goto error; - } gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; From e0b18ef7189075676ac432954d7920eaa30d8e3e Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:14 +0800 Subject: [PATCH 10/27] KVM: x86: remove magic number with enum cpuid_leafs This patch removes magic number with enum cpuid_leafs. Signed-off-by: Huaitong Han Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0029644bf09c..00a69ecdd401 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -305,7 +305,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; /* cpuid 1.edx */ - const u32 kvm_supported_word0_x86_features = + const u32 kvm_cpuid_1_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | @@ -315,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | 0 /* HTT, TM, Reserved, PBE */; /* cpuid 0x80000001.edx */ - const u32 kvm_supported_word1_x86_features = + const u32 kvm_cpuid_8000_0001_edx_x86_features = F(FPU) | F(VME) | F(DE) | F(PSE) | F(TSC) | F(MSR) | F(PAE) | F(MCE) | F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | @@ -325,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ - const u32 kvm_supported_word4_x86_features = + const u32 kvm_cpuid_1_ecx_x86_features = /* NOTE: MONITOR (and MWAIT) are emulated as NOP, * but *not* advertised to guests via CPUID ! 
*/ F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | @@ -337,27 +337,27 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ - const u32 kvm_supported_word6_x86_features = + const u32 kvm_cpuid_8000_0001_ecx_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ - const u32 kvm_supported_word5_x86_features = + const u32 kvm_cpuid_C000_0001_edx_x86_features = F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | F(PMM) | F(PMM_EN); /* cpuid 7.0.ebx */ - const u32 kvm_supported_word9_x86_features = + const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT); /* cpuid 0xD.1.eax */ - const u32 kvm_supported_word10_x86_features = + const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* all calls to cpuid_count() should be made on the same cpu */ @@ -376,10 +376,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, (u32)0xd); break; case 1: - entry->edx &= kvm_supported_word0_x86_features; - cpuid_mask(&entry->edx, 0); - entry->ecx &= kvm_supported_word4_x86_features; - cpuid_mask(&entry->ecx, 4); + entry->edx &= kvm_cpuid_1_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_1_EDX); + entry->ecx &= kvm_cpuid_1_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_1_ECX); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -433,8 +433,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ if (index == 0) { - entry->ebx &= kvm_supported_word9_x86_features; - cpuid_mask(&entry->ebx, 9); + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); } else @@ -514,7 +514,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, do_cpuid_1_ent(&entry[i], function, idx); if (idx == 1) { - entry[i].eax &= kvm_supported_word10_x86_features; + entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; entry[i].ebx = 0; if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) entry[i].ebx = @@ -564,10 +564,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: - entry->edx &= kvm_supported_word1_x86_features; - cpuid_mask(&entry->edx, 1); - entry->ecx &= kvm_supported_word6_x86_features; - cpuid_mask(&entry->ecx, 6); + entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; + cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); + entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -600,8 +600,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 
0xC0000004); break; case 0xC0000001: entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ From ddba262891494c1337fcbec8910605285012c121 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:15 +0800 Subject: [PATCH 11/27] KVM, pkeys: disable pkeys for guests in non-paging mode Pkeys is disabled if CPU is in non-paging mode in hardware. However KVM always uses paging mode to emulate guest non-paging mode with TDP. To emulate this behavior, pkeys needs to be manually disabled when guest switches to non-paging mode. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 161230016dfa..1b8a41f64d35 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3893,13 +3893,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!enable_unrestricted_guest && !is_paging(vcpu)) /* - * SMEP/SMAP is disabled if CPU is in non-paging mode in - * hardware. However KVM always uses paging mode without - * unrestricted guest. - * To emulate this behavior, SMEP/SMAP needs to be manually - * disabled when guest switches to non-paging mode. + * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in + * hardware. To emulate this behavior, SMEP/SMAP/PKU needs + * to be manually disabled when guest switches to non-paging + * mode. + * + * If !enable_unrestricted_guest, the CPU is always running + * with CR0.PG=1 and CR4 needs to be modified. + * If enable_unrestricted_guest, the CPU automatically + * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0. */ - hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); + hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); vmcs_writel(CR4_READ_SHADOW, cr4); vmcs_writel(GUEST_CR4, hw_cr4); From 17a511f878505d31f298de443269b3070c134163 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:16 +0800 Subject: [PATCH 12/27] KVM, pkeys: add pkeys support for xsave state This patch adds pkeys support for xsave state. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 007940faa5c6..7ce3634ab5fe 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void); #define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ - | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512) + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) extern u64 host_xcr0; extern u64 kvm_supported_xcr0(void); From 9e90199c25aec31b4509213881511948f6c763c8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 22 Mar 2016 16:51:17 +0800 Subject: [PATCH 13/27] x86: pkey: introduce write_pkru() for KVM KVM will use it to switch pkru between guest and host.
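[ Editor's note: how KVM ends up using the new helper -- fragments condensed from patch 14 later in this series, shown here as a sketch rather than verbatim code: ]

	vmx->host_pkru = read_pkru();		/* cached at vcpu_load time */

	/* before VM entry: */
	if (vmx->guest_pkru_valid)
		__write_pkru(vmx->guest_pkru);

	/* after VM exit: */
	if (boot_cpu_has(X86_FEATURE_OSPKE)) {
		vmx->guest_pkru = __read_pkru();
		if (vmx->guest_pkru != vmx->host_pkru) {
			vmx->guest_pkru_valid = true;
			__write_pkru(vmx->host_pkru);	/* restore host PKRU */
		} else
			vmx->guest_pkru_valid = false;
	}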
CC: Ingo Molnar CC: Dave Hansen Signed-off-by: Xiao Guangrong Signed-off-by: Huaitong Han Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/pgtable.h | 6 ++++++ arch/x86/include/asm/special_insns.h | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1ff49ec29ece..97f3242e133c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -107,6 +107,12 @@ static inline u32 read_pkru(void) return 0; } +static inline void write_pkru(u32 pkru) +{ + if (boot_cpu_has(X86_FEATURE_OSPKE)) + __write_pkru(pkru); +} + static inline int pte_young(pte_t pte) { return pte_flags(pte) & _PAGE_ACCESSED; diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index aee6e76e561e..d96d04377765 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -113,11 +113,27 @@ static inline u32 __read_pkru(void) : "c" (ecx)); return pkru; } + +static inline void __write_pkru(u32 pkru) +{ + u32 ecx = 0, edx = 0; + + /* + * "wrpkru" instruction. Loads contents in EAX to PKRU, + * requires that ecx = edx = 0. + */ + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (pkru), "c"(ecx), "d"(edx)); +} #else static inline u32 __read_pkru(void) { return 0; } + +static inline void __write_pkru(u32 pkru) +{ +} #endif static inline void native_wbinvd(void) From 1be0e61c1f255faaeab04a390e00c8b9b9042870 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 22 Mar 2016 16:51:18 +0800 Subject: [PATCH 14/27] KVM, pkeys: save/restore PKRU when guest/host switches Currently XSAVE state of host is not restored after VM-exit and PKRU is managed by XSAVE so the PKRU from guest is still controlling the memory access even if the CPU is running the code of host. This is not safe as KVM needs to access the memory of userspace (e.g. QEMU) to do some emulation. So we save/restore PKRU when guest/host switches.
+ */ + if (boot_cpu_has(X86_FEATURE_OSPKE)) { + vmx->guest_pkru = __read_pkru(); + if (vmx->guest_pkru != vmx->host_pkru) { + vmx->guest_pkru_valid = true; + __write_pkru(vmx->host_pkru); + } else + vmx->guest_pkru_valid = false; + } + /* * the KVM_REQ_EVENT optimization bit is only on for one entry, and if * we did not inject a still-pending event to L1 now because of From 2d344105f57ca77fc9c5d4377f65d1082f71ac4b Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:19 +0800 Subject: [PATCH 15/27] KVM, pkeys: introduce pkru_mask to cache conditions PKEYS defines a new status bit in the PFEC: PFEC.PK (bit 5). If certain conditions are true, the fault is considered a PKU violation. pkru_mask indicates whether we need to check PKRU.ADi and PKRU.WDi, and caches some of these conditions for permission_fault. [ Huaitong: Xiao helps to modify many sections. ] Signed-off-by: Huaitong Han Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 ++++ arch/x86/kvm/mmu.c | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 01c8b501cb6d..8968165963e4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -335,6 +335,14 @@ struct kvm_mmu { */ u8 permissions[16]; + /* + * The pkru_mask indicates if protection key checks are needed. It + * consists of 16 domains indexed by page fault error code bits [4:1], + * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. + * Each domain has 2 bits which are ANDed with AD and WD from PKRU. + */ + u32 pkru_mask; + u64 *pae_root; u64 *lm_root; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c512f095cdac..e46a81f4856e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3923,6 +3923,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, } } +/* +* PKU is an additional mechanism by which the paging controls access to +* user-mode addresses based on the value in the PKRU register. Protection +* key violations are reported through a bit in the page fault error code. +* Unlike other bits of the error code, the PK bit is not known at the +* call site of e.g. gva_to_gpa; it must be computed directly in +* permission_fault based on two bits of PKRU, on some machine state (CR4, +* CR0, EFER, CPL), and on other bits of the error code and the page tables. +* +* In particular the following conditions come from the error code, the +* page tables and the machine state: +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1 +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch) +* - PK is always zero if U=0 in the page tables +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access. +* +* The PKRU bitmask caches the result of these four conditions. The error +* code (minus the P bit) and the page table's U bit form an index into the +* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed +* with the two bits of the PKRU register corresponding to the protection key. +* For the first three conditions above the bits will be 00, thus masking +* away both AD and WD. For all reads or if the last condition holds, WD +* only will be masked away. +*/ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + bool ept) +{ + unsigned bit; + bool wp; + + if (ept) { + mmu->pkru_mask = 0; + return; + } + + /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set.
*/ + if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) { + mmu->pkru_mask = 0; + return; + } + + wp = is_write_protection(vcpu); + + for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) { + unsigned pfec, pkey_bits; + bool check_pkey, check_write, ff, uf, wf, pte_user; + + pfec = bit << 1; + ff = pfec & PFERR_FETCH_MASK; + uf = pfec & PFERR_USER_MASK; + wf = pfec & PFERR_WRITE_MASK; + + /* PFEC.RSVD is replaced by ACC_USER_MASK. */ + pte_user = pfec & PFERR_RSVD_MASK; + + /* + * Only need to check the access which is not an + * instruction fetch and is to a user page. + */ + check_pkey = (!ff && pte_user); + /* + * write access is controlled by PKRU if it is a + * user access or CR0.WP = 1. + */ + check_write = check_pkey && wf && (uf || wp); + + /* PKRU.AD stops both read and write access. */ + pkey_bits = !!check_pkey; + /* PKRU.WD stops write access. */ + pkey_bits |= (!!check_write) << 1; + + mmu->pkru_mask |= (pkey_bits & 3) << pfec; + } +} + static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) { unsigned root_level = mmu->root_level; @@ -3941,6 +4016,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); MMU_WARN_ON(!is_pae(vcpu)); @@ -3968,6 +4044,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); context->page_fault = paging32_page_fault; @@ -4026,6 +4103,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context, false); + update_pkru_bitmask(vcpu, context, false); update_last_nonleaf_level(vcpu, context); reset_tdp_shadow_zero_bits_mask(vcpu, context); } @@ -4078,6 +4156,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_pkru_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } @@ -4132,6 +4211,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context, false); + update_pkru_bitmask(vcpu, g_context, false); update_last_nonleaf_level(vcpu, g_context); } From be94f6b71067df47d623fc6c6983a8dee504fb4d Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:20 +0800 Subject: [PATCH 16/27] KVM, pkeys: add pkeys support for permission_fault Protection keys define a new 4-bit protection key field (PKEY) in bits 62:59 of leaf entries of the page tables. The PKEY is an index into the PKRU register (16 domains); every domain has 2 bits (a write-disable bit and an access-disable bit). The static logic has been produced in update_pkru_bitmask; the dynamic logic needs to read the pkey from the page table entries, get the PKRU value, and deduce the correct result. [ Huaitong: Xiao helps to modify many sections.
] Signed-off-by: Huaitong Han Signed-off-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/kvm_cache_regs.h | 5 +++++ arch/x86/kvm/mmu.h | 33 ++++++++++++++++++++++++++++----- arch/x86/kvm/paging_tmpl.h | 16 ++++++++++++++-- arch/x86/kvm/svm.c | 8 ++++++++ arch/x86/kvm/vmx.c | 8 ++++++++ arch/x86/kvm/x86.c | 7 ++++++- 7 files changed, 72 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8968165963e4..0d75ecdfa077 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -187,12 +187,14 @@ enum { #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 +#define PFERR_PK_BIT 5 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) #define PFERR_USER_MASK (1U << PFERR_USER_BIT) #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) +#define PFERR_PK_MASK (1U << PFERR_PK_BIT) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -882,6 +884,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index e1e89ee4af75..762cdf2595f9 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); } +static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_pkru(vcpu); +} + static inline void enter_guest_mode(struct kvm_vcpu *vcpu) { vcpu->arch.hflags |= HF_GUEST_MASK; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 8b2b3dfca1ae..b70df72e2b33 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -10,10 +10,11 @@ #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS) #define PT_WRITABLE_SHIFT 1 +#define PT_USER_SHIFT 2 #define PT_PRESENT_MASK (1ULL << 0) #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT) -#define PT_USER_MASK (1ULL << 2) +#define PT_USER_MASK (1ULL << PT_USER_SHIFT) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 @@ -149,7 +150,8 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) * if the access faults. */ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - unsigned pte_access, unsigned pfec) + unsigned pte_access, unsigned pte_pkey, + unsigned pfec) { int cpl = kvm_x86_ops->get_cpl(vcpu); unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); @@ -170,11 +172,32 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC); int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + bool fault = (mmu->permissions[index] >> pte_access) & 1; - WARN_ON(pfec & PFERR_RSVD_MASK); - + WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); pfec |= PFERR_PRESENT_MASK; - return -((mmu->permissions[index] >> pte_access) & 1) & pfec; + + if (unlikely(mmu->pkru_mask)) { + u32 pkru_bits, offset; + + /* + * PKRU defines 32 bits, there are 16 domains and 2 + * attribute bits per domain in pkru. 
pte_pkey is the + * index of the protection domain, so pte_pkey * 2 + * is the index of the first bit for the domain. + */ + pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3; + + /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */ + offset = pfec - 1 + + ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT)); + + pkru_bits &= mmu->pkru_mask >> offset; + pfec |= -pkru_bits & PFERR_PK_MASK; + fault |= (pkru_bits != 0); + } + + return -(uint32_t)fault & pfec; } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 45ed3c38d20e..e0c225421157 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, return 0; } +static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned pkeys = 0; +#if PTTYPE == 64 + pte_t pte = {.pte = gpte}; + + pkeys = pte_flags_pkey(pte_flags(pte)); +#endif + return pkeys; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty; + unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -359,7 +370,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - errcode = permission_fault(vcpu, mmu, pte_access, access); + pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); + errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95070386d599..31346a3f20a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static u32 svm_get_pkru(struct kvm_vcpu *vcpu) +{ + return 0; +} + static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { switch (reg) { @@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + + .get_pkru = svm_get_pkru, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 87664775b51e..efc243e4dabf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2292,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->guest_pkru; +} + static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); @@ -10925,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + + .get_pkru = vmx_get_pkru, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7236bd3a4c3d..d0a5918da654 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4326,9 +4326,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, u32 access =
((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); + /* + * currently PKRU is only applied to ept enabled guest so + * there is no pkey in EPT page table for L1 guest or EPT + * shadow page table for L2 guest. + */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, access)) { + vcpu->arch.access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); From b9baba861489041b37b54fc7ee0b0006b5327151 Mon Sep 17 00:00:00 2001 From: Huaitong Han Date: Tue, 22 Mar 2016 16:51:21 +0800 Subject: [PATCH 17/27] KVM, pkeys: expose CPUID/CR4 to guest X86_FEATURE_PKU is referred to as "PKU" in the hardware documentation: CPUID.7.0.ECX[3]:PKU. X86_FEATURE_OSPKE is software support for pkeys, enumerated with CPUID.7.0.ECX[4]:OSPKE, and it reflects the setting of CR4.PKE(bit 22). This patch disables CPUID:PKU without ept, because pkeys is not yet implemented for shadow paging. Signed-off-by: Huaitong Han Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 23 +++++++++++++++++++++-- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/x86.c | 9 ++++++--- 4 files changed, 37 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0d75ecdfa077..f62a9f37f79f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -84,7 +84,8 @@ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP)) + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \ + | X86_CR4_PKE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 00a69ecdd401..8efb839948e5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) apic->lapic_timer.timer_mode_mask = 1 << 17; } + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best) { + /* Update OSPKE bit */ + if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { + best->ecx &= ~F(OSPKE); + if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) + best->ecx |= F(OSPKE); + } + } + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; @@ -360,6 +370,9 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + /* cpuid 7.0.ecx*/ + const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -437,10 +450,16 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, cpuid_mask(&entry->ebx, CPUID_7_0_EBX); // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); - } else + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* PKU is not yet implemented for shadow paging. 
*/ + if (!tdp_enabled) + entry->ecx &= ~F(PKU); + } else { entry->ebx = 0; + entry->ecx = 0; + } entry->eax = 0; - entry->ecx = 0; entry->edx = 0; break; } diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 66a6581724ad..e17a74b1d852 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ecx & bit(X86_FEATURE_PKU)); +} + static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d0a5918da654..98ae1fb9fd66 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP; + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; if (cr4 & CR4_RESERVED_BITS) return 1; @@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE)) return 1; + if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; @@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); return 0; @@ -7128,7 +7131,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & X86_CR4_OSXSAVE) + if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); From 9753f529156d03112d1ff6f3758ccd6fa6faf189 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:24 +0800 Subject: [PATCH 18/27] KVM: Remove redundant smp_mb() in the kvm_mmu_commit_zap_page() There is already a barrier inside kvm_flush_remote_tlbs() which helps to make sure that everyone sees our modifications to the page tables and sees changes to vcpu->mode here. So remove the smp_mb() in kvm_mmu_commit_zap_page() and update the comment. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e46a81f4856e..fc8ab1f67d00 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2390,14 +2390,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, return; /* - * wmb: make sure everyone sees our modifications to the page tables - * rmb: make sure we see changes to vcpu->mode - */ - smp_mb(); - - /* - * Wait for all vcpus to exit guest mode and/or lockless shadow - * page table walks. + * We need to make sure everyone sees our modifications to + * the page tables and see changes to vcpu->mode here. The barrier + * in the kvm_flush_remote_tlbs() achieves this. This pairs + * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ * + * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit + * guest mode and/or lockless shadow page table walks. */ kvm_flush_remote_tlbs(kvm); From 36ca7e0a577aa61f648953886958e50398732d63 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:25 +0800 Subject: [PATCH 19/27] KVM/x86: Replace smp_mb() with smp_store_mb/release() in the walk_shadow_page_lockless_begin/end() Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fc8ab1f67d00..6bdfbc23ecaa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) * kvm_flush_remote_tlbs() IPI to all active vcpus. */ local_irq_disable(); - vcpu->mode = READING_SHADOW_PAGE_TABLES; + /* * Make sure a following spte read is not reordered ahead of the write * to vcpu->mode. */ - smp_mb(); + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) @@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) * reads to sptes. If it does, kvm_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ - smp_mb(); - vcpu->mode = OUTSIDE_GUEST_MODE; + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); } From a30a0509165d9bc5a3107894338e6adf4be7b60f Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:26 +0800 Subject: [PATCH 20/27] KVM: Replace smp_mb() with smp_mb__after_atomic() in the kvm_make_all_cpus_request() Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index db021c383aa7..402590dfcf9b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -170,8 +170,8 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - /* Set ->requests bit before we read ->mode */ - smp_mb(); + /* Set ->requests bit before we read ->mode. */ + smp_mb__after_atomic(); if (cpus != NULL && cpu != -1 && cpu != me && kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) From 7bfdf2177812c30928bea3fc8bc86b9dea236f65 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:27 +0800 Subject: [PATCH 21/27] KVM/x86: Call smp_wmb() before increasing tlbs_dirty Update spte before increasing tlbs_dirty to make sure no tlb flush is lost after spte is zapped. This pairs with the barrier in kvm_flush_remote_tlbs(). Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e0c225421157..1d971c7553c3 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -960,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + /* + * Update spte before increasing tlbs_dirty to make + * sure no tlb flush is lost after spte is zapped; see + * the comments in kvm_flush_remote_tlbs().
+ */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } if (gfn != sp->gfns[i]) { drop_spte(vcpu->kvm, &sp->spt[i]); + /* + * The same as above where we are doing + * prefetch_invalid_gpte(). + */ + smp_wmb(); vcpu->kvm->tlbs_dirty++; continue; } From 4ae3cb3a2551b41f22284f713e7d5e2b61a85c1d Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:28 +0800 Subject: [PATCH 22/27] KVM: Replace smp_mb() with smp_load_acquire() in the kvm_flush_remote_tlbs() smp_load_acquire() is enough here and it's cheaper than smp_mb(). Add a comment about reusing the memory barrier of kvm_make_all_cpus_request() here to keep ordering between modifications to the page tables and reads of mode. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 402590dfcf9b..4fd482fb9260 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -191,9 +191,23 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL void kvm_flush_remote_tlbs(struct kvm *kvm) { - long dirty_count = kvm->tlbs_dirty; + /* + * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in + * kvm_make_all_cpus_request. + */ + long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); - smp_mb(); + /* + * We want to publish modifications to the page tables before reading + * mode. Pairs with a memory barrier in arch-specific code. + * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest + * and smp_mb in walk_shadow_page_lockless_begin/end. + * - powerpc: smp_mb in kvmppc_prepare_to_enter. + * + * There is already an smp_mb__after_atomic() before + * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that + * barrier here. + */ if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) ++kvm->stat.remote_tlb_flush; cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); From 0f127d12e44c13d2f90e5c613490e70ddb1a5e08 Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:29 +0800 Subject: [PATCH 23/27] KVM/x86: update the comment of memory barrier in the vcpu_enter_guest() The barrier also orders the write to mode before any reads from the page tables done while the VCPU is running, so update the comment. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98ae1fb9fd66..e260ccbc8f55 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6596,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - /* We should set ->mode before check ->requests, - * see the comment in make_all_cpus_request. + /* + * We should set ->mode before check ->requests, + * Please see the comment in kvm_make_all_cpus_request. + * This also orders the write to mode from any reads + * to the page tables done while the VCPU is running. + * Please see the comment in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); From 489153c746746fc44b1e2fd184248dee8aebbaff Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Sun, 13 Mar 2016 11:10:30 +0800 Subject: [PATCH 24/27] KVM/PPC: update the comment of memory barrier in the kvmppc_prepare_to_enter() The barrier also orders the write to mode before any reads from the page tables done while the VCPU is running, so update the comment.
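[ Editor's note: the two-sided pattern that patches 18-24 document, written out as a standalone C sketch. This is not kernel code; the variable and function names are illustrative: ]

static unsigned long requests;		/* stand-in for vcpu->requests */
static int mode;			/* stand-in for vcpu->mode */

static void vcpu_entry_side(void)	/* cf. vcpu_enter_guest() and
					 * kvmppc_prepare_to_enter() */
{
	WRITE_ONCE(mode, 1);		/* enter IN_GUEST_MODE */
	smp_mb();			/* order mode write before requests read */
	if (test_bit(0, &requests))
		return;			/* handle the request, do not enter guest */
}

static void requester_side(void)	/* cf. kvm_make_all_cpus_request() */
{
	set_bit(0, &requests);		/* e.g. KVM_REQ_TLB_FLUSH */
	smp_mb__after_atomic();		/* order requests write before mode read */
	if (READ_ONCE(mode))
		;			/* vCPU may be in guest mode: send an IPI */
}

/* With a full barrier on each side, at least one side is guaranteed to
 * observe the other's write, so a TLB flush request can never be lost. */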
From 9db284f30311b741a5e74eb8e5800ff7b92c4be3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 21 Mar 2016 15:13:27 +0100 Subject: [PATCH 25/27] kvm, rt: change async pagefault code locking for PREEMPT_RT The async pagefault wake code can run from the idle task in exception context, so everything here needs to be made non-preemptible. Conversion to a simple wait queue and raw spinlock does the trick. Signed-off-by: Rik van Riel Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 47190bd399e7..807950860fb7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -36,6 +36,7 @@ #include <linux/kprobes.h> #include <linux/debugfs.h> #include <linux/nmi.h> +#include <linux/swait.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -91,14 +92,14 @@ static void kvm_io_delay(void) struct kvm_task_sleep_node { struct hlist_node link; - wait_queue_head_t wq; + struct swait_queue_head wq; u32 token; int cpu; bool halted; }; static struct kvm_task_sleep_head { - spinlock_t lock; + raw_spinlock_t lock; struct hlist_head list; } async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE]; @@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token) u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; struct kvm_task_sleep_node n, *e; - DEFINE_WAIT(wait); + DECLARE_SWAITQUEUE(wait); rcu_irq_enter(); - spin_lock(&b->lock); + raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); kfree(e); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); rcu_irq_exit(); return; @@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); n.halted = is_idle_task(current) || preempt_count() > 1; - init_waitqueue_head(&n.wq); + init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); for (;;) { if (!n.halted) - prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; @@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token) } } if (!n.halted) - finish_wait(&n.wq, &wait); + finish_swait(&n.wq, &wait); rcu_irq_exit(); return; @@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (waitqueue_active(&n->wq)) - wake_up(&n->wq); + else if (swait_active(&n->wq)) + swake_up(&n->wq); } static void apf_task_wake_all(void) @@ -189,14 +190,14 @@ static void apf_task_wake_all(void) for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; - spin_lock(&b->lock); + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { struct kvm_task_sleep_node *n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); } } @@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token) } again: - spin_lock(&b->lock); + raw_spin_lock(&b->lock); n = _find_apf_task(b, token); if (!n) { /* @@ -225,17 +226,17 @@ void kvm_async_pf_task_wake(u32 token) * Allocation failed! Busy wait while other cpu * handles async PF. */ - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); cpu_relax(); goto again; } n->token = token; n->cpu = smp_processor_id(); - init_waitqueue_head(&n->wq); + init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); } else apf_task_wake_one(n); - spin_unlock(&b->lock); + raw_spin_unlock(&b->lock); return; } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); @@ -486,7 +487,7 @@ void __init kvm_guest_init(void) paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) - spin_lock_init(&async_pf_sleepers[i].lock); + raw_spin_lock_init(&async_pf_sleepers[i].lock); if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) x86_init.irqs.trap_init = kvm_apf_trap_init;
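The conversion matters because on PREEMPT_RT an ordinary spinlock_t can sleep, and an ordinary waitqueue takes such a lock internally; neither is allowed in the exception/idle context the async PF wake code can run in. A minimal kernel-style sketch of the resulting pattern follows (illustrative only, not a loadable module; the demo_* names are invented, and swake_up() is the API of this era, later renamed swake_up_one()):

#include <linux/swait.h>
#include <linux/spinlock.h>
#include <linux/sched.h>

static DEFINE_RAW_SPINLOCK(demo_lock);     /* raw: never sleeps, even on RT  */
static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);  /* simple waitqueue: RT-safe wake */
static bool demo_done;

static void demo_wait(void)                /* sleeper, ordinary task context */
{
	DECLARE_SWAITQUEUE(wait);

	for (;;) {
		prepare_to_swait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(demo_done))
			break;
		schedule();
	}
	finish_swait(&demo_wq, &wait);
}

static void demo_wake(void)                /* waker, may run from exception context */
{
	raw_spin_lock(&demo_lock);
	WRITE_ONCE(demo_done, true);
	if (swait_active(&demo_wq))
		swake_up(&demo_wq);
	raw_spin_unlock(&demo_lock);
}

The real code keeps per-bucket hlists behind the raw lock rather than a single flag; the point here is only the API shape: raw_spin_lock() for a short, non-sleeping critical section, and swake_up() for a wakeup that is safe where preemption must not occur.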
From 0af574be32cddf8d0c2fbb194dcc635e73a83f1e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 21 Mar 2016 15:05:17 +0100 Subject: [PATCH 26/27] KVM: PPC: do not compile in vfio.o unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Build on 32-bit PPC fails with the following error: int kvm_vfio_ops_init(void) ^ In file included from arch/powerpc/kvm/../../../virt/kvm/vfio.c:21:0: arch/powerpc/kvm/../../../virt/kvm/vfio.h:8:90: note: previous definition of ‘kvm_vfio_ops_init’ was here arch/powerpc/kvm/../../../virt/kvm/vfio.c:292:6: error: redefinition of ‘kvm_vfio_ops_exit’ void kvm_vfio_ops_exit(void) ^ In file included from arch/powerpc/kvm/../../../virt/kvm/vfio.c:21:0: arch/powerpc/kvm/../../../virt/kvm/vfio.h:12:91: note: previous definition of ‘kvm_vfio_ops_exit’ was here scripts/Makefile.build:258: recipe for target 'arch/powerpc/kvm/../../../virt/kvm/vfio.o' failed make[3]: *** [arch/powerpc/kvm/../../../virt/kvm/vfio.o] Error 1 Check whether CONFIG_KVM_VFIO is set before including vfio.o in the build. Reported-by: Pranith Kumar Tested-by: Pranith Kumar Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 7f7b6d86ac73..eba0bea6e032 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -8,7 +8,8 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm KVM := ../../../virt/kvm common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ - $(KVM)/eventfd.o $(KVM)/vfio.o + $(KVM)/eventfd.o +common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o CFLAGS_e500_mmu.o := -I. CFLAGS_e500_mmu_host.o := -I.
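The one-line Makefile change relies on the usual kbuild idiom: append an object to a list whose suffix is the config symbol. When CONFIG_KVM_VFIO=y the object lands in common-objs-y and gets linked; when the symbol is unset the append goes to the never-referenced variable common-objs- and the file is simply not built. A hypothetical fragment showing the shape (CONFIG_DEMO, core.o and demo.o are invented names):

common-objs-y = core.o
common-objs-$(CONFIG_DEMO) += demo.o   # appended to common-objs-y only if CONFIG_DEMO=y

kvm-objs := $(common-objs-y)
obj-$(CONFIG_KVM) += kvm.o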
From a6adb106225f9e2f177d3d883596e011df321965 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 22 Mar 2016 17:25:42 +0100 Subject: [PATCH 27/27] KVM: page_track: fix access to NULL slot This happens when doing the reboot test from virt-tests: [ 131.833653] BUG: unable to handle kernel NULL pointer dereference at (null) [ 131.842461] IP: [] kvm_page_track_is_active+0x17/0x60 [kvm] [ 131.850500] PGD 0 [ 131.852763] Oops: 0000 [#1] SMP [ 132.007188] task: ffff880075fbc500 ti: ffff880850a3c000 task.ti: ffff880850a3c000 [ 132.138891] Call Trace: [ 132.141639] [] page_fault_handle_page_track+0x31/0x40 [kvm] [ 132.149732] [] paging64_page_fault+0xff/0x910 [kvm] [ 132.172159] [] kvm_mmu_page_fault+0x64/0x110 [kvm] [ 132.179372] [] handle_exception+0x1b2/0x430 [kvm_intel] [ 132.187072] [] vmx_handle_exit+0x1e1/0xc50 [kvm_intel] ... Cc: Xiao Guangrong Fixes: 3d0c27ad6ee465f174b09ee99fcaf189c57d567a Signed-off-by: Paolo Bonzini --- arch/x86/kvm/page_track.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index 11f76436f74f..b431539c3714 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { - struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + struct kvm_memory_slot *slot; + int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!slot) + return false; + + index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); }
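The shape of this last fix is worth noting: computing gfn_to_index() in the declaration's initializer dereferenced slot before any check could run, so the patch resolves the possibly-absent slot first, bails out early, and only then derives values from it. A small userspace sketch of the same pattern (all names invented; this is not the KVM API):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct memslot { unsigned long base_gfn; };

static struct memslot *lookup_slot(unsigned long gfn)
{
	(void)gfn;
	return NULL;  /* e.g. no slot covers this gfn while memslots change across a reboot */
}

static bool is_tracked(unsigned long gfn)
{
	struct memslot *slot = lookup_slot(gfn);
	unsigned long index;

	if (!slot)                       /* the check the patch adds */
		return false;

	index = gfn - slot->base_gfn;    /* derived only after the check */
	return index < 512;              /* stand-in for the gfn_track lookup */
}

int main(void)
{
	printf("tracked: %d\n", is_tracked(0x1234));
	return 0;
}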