From 425333bf3a7743715c17e503049d0837d6c4a603 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 10 Sep 2018 18:29:07 +1000 Subject: [PATCH 01/23] KVM: PPC: Avoid marking DMA-mapped pages dirty in real mode At the moment the real mode handler of H_PUT_TCE calls iommu_tce_xchg_rm() which in turn reads the old TCE and if it was a valid entry, marks the physical page dirty if it was mapped for writing. Since it is in real mode, realmode_pfn_to_page() is used instead of pfn_to_page() to get the page struct. However SetPageDirty() itself reads the compound page head and returns a virtual address for the head page struct and setting dirty bit for that kills the system. This adds additional dirty bit tracking into the MM/IOMMU API for use in the real mode. Note that this does not change how VFIO and KVM (in virtual mode) set this bit. The KVM (real mode) changes include: - use the lowest bit of the cached host phys address to carry the dirty bit; - mark pages dirty when they are unpinned which happens when the preregistered memory is released which always happens in virtual mode; - add mm_iommu_ua_mark_dirty_rm() helper to set delayed dirty bit; - change iommu_tce_xchg_rm() to take the kvm struct for the mm to use in the new mm_iommu_ua_mark_dirty_rm() helper; - move iommu_tce_xchg_rm() to book3s_64_vio_hv.c (which is the only caller anyway) to reduce the real mode KVM and IOMMU knowledge across different subsystems. This removes realmode_pfn_to_page() as it is not used anymore. While we at it, remove some EXPORT_SYMBOL_GPL() as that code is for the real mode only and modules cannot call it anyway. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 - arch/powerpc/include/asm/iommu.h | 2 - arch/powerpc/include/asm/mmu_context.h | 1 + arch/powerpc/kernel/iommu.c | 25 ---------- arch/powerpc/kvm/book3s_64_vio_hv.c | 39 ++++++++++++---- arch/powerpc/mm/init_64.c | 49 -------------------- arch/powerpc/mm/mmu_context_iommu.c | 34 ++++++++++++-- 7 files changed, 62 insertions(+), 89 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 13a688fc8cd0..2fdc865ca374 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1051,7 +1051,6 @@ static inline void vmemmap_remove_mapping(unsigned long start, return hash__vmemmap_remove_mapping(start, page_size); } #endif -struct page *realmode_pfn_to_page(unsigned long pfn); static inline pte_t pmd_pte(pmd_t pmd) { diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index ab3a4fba38e3..3d4b88cb8599 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -220,8 +220,6 @@ extern void iommu_del_device(struct device *dev); extern int __init tce_iommu_bus_notifier_init(void); extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); -extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b2f89b621b15..b694d6af1150 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -38,6 +38,7 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned int pageshift, unsigned long *hpa); +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index af7a20dc6e09..19b4c628f3be 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1013,31 +1013,6 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, } EXPORT_SYMBOL_GPL(iommu_tce_xchg); -#ifdef CONFIG_PPC_BOOK3S_64 -long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret; - - ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); - - if (!ret && ((*direction == DMA_FROM_DEVICE) || - (*direction == DMA_BIDIRECTIONAL))) { - struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); - - if (likely(pg)) { - SetPageDirty(pg); - } else { - tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); - ret = -EFAULT; - } - } - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); -#endif - int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 506a4d400458..6821ead4b4eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -187,12 +187,35 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) +{ + long ret; + + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) { + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + /* + * kvmppc_rm_tce_iommu_do_map() updates the UA cache after + * calling this so we still get here a valid UA. + */ + if (pua && *pua) + mm_iommu_ua_mark_dirty_rm(mm, be64_to_cpu(*pua)); + } + + return ret; +} + +static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry) { unsigned long hpa = 0; enum dma_data_direction dir = DMA_NONE; - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); } static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, @@ -224,7 +247,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, unsigned long hpa = 0; long ret; - if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir)) /* * real mode xchg can fail if struct page crosses * a page boundary @@ -236,7 +259,7 @@ static long kvmppc_rm_tce_iommu_do_unmap(struct kvm *kvm, ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); if (ret) - iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); return ret; } @@ -282,7 +305,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_CLOSED; - ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); if (ret) { mm_iommu_mapped_dec(mem); /* @@ -371,7 +394,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); } kvmppc_tce_put(stt, entry, tce); @@ -520,7 +543,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, goto unlock_exit; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); } kvmppc_tce_put(stt, entry + i, tce); @@ -571,7 +594,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, return ret; WARN_ON_ONCE_RM(1); - kvmppc_rm_clear_tce(stit->tbl, entry); + kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); } } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 51ce091914f9..7a9886f98b0c 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -308,55 +308,6 @@ void register_page_bootmem_memmap(unsigned long section_nr, { } -/* - * We do not have access to the sparsemem vmemmap, so we fallback to - * walking the list of sparsemem blocks which we already maintain for - * the sake of crashdump. In the long run, we might want to maintain - * a tree if performance of that linear walk becomes a problem. - * - * realmode_pfn_to_page functions can fail due to: - * 1) As real sparsemem blocks do not lay in RAM continously (they - * are in virtual address space which is not available in the real mode), - * the requested page struct can be split between blocks so get_page/put_page - * may fail. - * 2) When huge pages are used, the get_page/put_page API will fail - * in real mode as the linked addresses in the page struct are virtual - * too. - */ -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct vmemmap_backing *vmem_back; - struct page *page; - unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; - unsigned long pg_va = (unsigned long) pfn_to_page(pfn); - - for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) { - if (pg_va < vmem_back->virt_addr) - continue; - - /* After vmemmap_list entry free is possible, need check all */ - if ((pg_va + sizeof(struct page)) <= - (vmem_back->virt_addr + page_size)) { - page = (struct page *) (vmem_back->phys + pg_va - - vmem_back->virt_addr); - return page; - } - } - - /* Probably that page struct is split between real pages */ - return NULL; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - -#else - -struct page *realmode_pfn_to_page(unsigned long pfn) -{ - struct page *page = pfn_to_page(pfn); - return page; -} -EXPORT_SYMBOL_GPL(realmode_pfn_to_page); - #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index c9ee9e23845f..56c2234cc6ae 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -18,11 +18,15 @@ #include #include #include +#include #include #include static DEFINE_MUTEX(mem_list_mutex); +#define MM_IOMMU_TABLE_GROUP_PAGE_DIRTY 0x1 +#define MM_IOMMU_TABLE_GROUP_PAGE_MASK ~(SZ_4K - 1) + struct mm_iommu_table_group_mem_t { struct list_head next; struct rcu_head rcu; @@ -263,6 +267,9 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) if (!page) continue; + if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) + SetPageDirty(page); + put_page(page); mem->hpas[i] = 0; } @@ -360,7 +367,6 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, return ret; } -EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries) @@ -390,7 +396,7 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, if (pageshift > mem->pageshift) return -EFAULT; - *hpa = *va | (ua & ~PAGE_MASK); + *hpa = (*va & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } @@ -413,11 +419,31 @@ long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, if (!pa) return -EFAULT; - *hpa = *pa | (ua & ~PAGE_MASK); + *hpa = (*pa & MM_IOMMU_TABLE_GROUP_PAGE_MASK) | (ua & ~PAGE_MASK); return 0; } -EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); + +extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua) +{ + struct mm_iommu_table_group_mem_t *mem; + long entry; + void *va; + unsigned long *pa; + + mem = mm_iommu_lookup_rm(mm, ua, PAGE_SIZE); + if (!mem) + return; + + entry = (ua - mem->ua) >> PAGE_SHIFT; + va = &mem->hpas[entry]; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return; + + *pa |= MM_IOMMU_TABLE_GROUP_PAGE_DIRTY; +} long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { From 71d29f43b6332badc5598c656616a62575e83342 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 11 Sep 2018 20:48:34 +1000 Subject: [PATCH 02/23] KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson Cc: "Aneesh Kumar K.V" Cc: kvm-ppc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 91 +++++++++++--------------- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index fd6e8c13685f..933c574e1cf7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -525,8 +525,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq, pte_size; - unsigned long gpa, gfn, hva, pfn; + unsigned long mmu_seq; + unsigned long gpa, gfn, hva; struct kvm_memory_slot *memslot; struct page *page = NULL; long ret; @@ -623,9 +623,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ hva = gfn_to_hva_memslot(memslot, gfn); if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { - pfn = page_to_pfn(page); upgrade_write = true; } else { + unsigned long pfn; + /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, upgrade_p); @@ -639,61 +640,43 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } - /* See if we can insert a 1GB or 2MB large PTE here */ - level = 0; - if (page && PageCompound(page)) { - pte_size = PAGE_SIZE << compound_order(compound_head(page)); - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); + /* + * Read the PTE from the process' radix tree and use that + * so we get the shift and attribute bits. + */ + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + pte = *ptep; + local_irq_enable(); + + /* Get pte level from shift/size */ + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + } else { + level = 0; + if (shift > PAGE_SHIFT) { + /* + * If the pte maps more than one page, bring over + * bits from the virtual address to get the real + * address of the specific single page we want. + */ + unsigned long rpnmask = (1ul << shift) - PAGE_SIZE; + pte = __pte(pte_val(pte) | (hva & rpnmask)); } } - /* - * Compute the PTE value that we need to insert. - */ - if (page) { - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | - _PAGE_ACCESSED; - if (writing || upgrade_write) - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); + pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); + if (writing || upgrade_write) { + if (pte_val(pte) & _PAGE_WRITE) + pte = __pte(pte_val(pte) | _PAGE_DIRTY); } else { - /* - * Read the PTE from the process' radix tree and use that - * so we get the attribute bits. - */ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); - pte = *ptep; - local_irq_enable(); - if (shift == PUD_SHIFT && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - } else if (shift == PMD_SHIFT && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - } else if (shift && shift != PAGE_SHIFT) { - /* Adjust PFN */ - unsigned long mask = (1ul << shift) - PAGE_SIZE; - pte = __pte(pte_val(pte) | (hva & mask)); - } - pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED); - if (writing || upgrade_write) { - if (pte_val(pte) & _PAGE_WRITE) - pte = __pte(pte_val(pte) | _PAGE_DIRTY); - } else { - pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); - } + pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY)); } /* Allocate space in the tree and write the PTE */ From 1843abd03250115af6cec0892683e70cf2297c25 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Thu, 16 Aug 2018 09:02:31 +0100 Subject: [PATCH 03/23] s390/mm: Check for valid vma before zapping in gmap_discard Userspace could have munmapped the area before doing unmapping from the gmap. This would leave us with a valid vmaddr, but an invalid vma from which we would try to zap memory. Let's check before using the vma. Fixes: 1e133ab296f3 ("s390/mm: split arch/s390/mm/pgtable.c") Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Reported-by: Dan Carpenter Message-Id: <20180816082432.78828-1-frankja@linux.ibm.com> Signed-off-by: Janosch Frank --- arch/s390/mm/gmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index bb44990c8212..911c7ded35f1 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -708,11 +708,13 @@ void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to) vmaddr |= gaddr & ~PMD_MASK; /* Find vma in the parent mm */ vma = find_vma(gmap->mm, vmaddr); + if (!vma) + continue; /* * We do not discard pages that are backed by * hugetlbfs, so we don't have to refault them. */ - if (vma && is_vm_hugetlb_page(vma)) + if (is_vm_hugetlb_page(vma)) continue; size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK)); zap_page_range(vma, vmaddr, size); From 40ebdb8e59df36e2cc71810bd021a0808b16c956 Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Wed, 1 Aug 2018 11:48:28 +0100 Subject: [PATCH 04/23] KVM: s390: Make huge pages unavailable in ucontrol VMs We currently do not notify all gmaps when using gmap_pmdp_xchg(), due to locking constraints. This makes ucontrol VMs, which is the only VM type that creates multiple gmaps, incompatible with huge pages. Also we would need to hold the guest_table_lock of all gmaps that have this vmaddr maped to synchronize access to the pmd. ucontrol VMs are rather exotic and creating a new locking concept is no easy task. Hence we return EINVAL when trying to active KVM_CAP_S390_HPAGE_1M and report it as being not available when checking for it. Fixes: a4499382 ("KVM: s390: Add huge page enablement control") Signed-off-by: Janosch Frank Reviewed-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Message-Id: <20180801112508.138159-1-frankja@linux.ibm.com> Signed-off-by: Janosch Frank --- Documentation/virtual/kvm/api.txt | 3 ++- arch/s390/kvm/kvm-s390.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c664064f76fb..8d8a372c8340 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4510,7 +4510,8 @@ Do not enable KVM_FEATURE_PV_UNHALT if you disable HLT exits. Architectures: s390 Parameters: none Returns: 0 on success, -EINVAL if hpage module parameter was not set - or cmma is enabled + or cmma is enabled, or the VM has the KVM_VM_S390_UCONTROL + flag set With this capability the KVM support for memory backing with 1m pages through hugetlbfs can be enabled for a VM. After the capability is diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index f69333fd2fa3..ac5da6b0b862 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -481,7 +481,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_S390_HPAGE_1M: r = 0; - if (hpage) + if (hpage && !kvm_is_ucontrol(kvm)) r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -691,7 +691,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) mutex_lock(&kvm->lock); if (kvm->created_vcpus) r = -EBUSY; - else if (!hpage || kvm->arch.use_cmma) + else if (!hpage || kvm->arch.use_cmma || kvm_is_ucontrol(kvm)) r = -EINVAL; else { r = 0; From d1766202779e81d0f2a94c4650a6ba31497d369d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 2 Aug 2018 17:08:16 +0200 Subject: [PATCH 05/23] x86/kvm/lapic: always disable MMIO interface in x2APIC mode When VMX is used with flexpriority disabled (because of no support or if disabled with module parameter) MMIO interface to lAPIC is still available in x2APIC mode while it shouldn't be (kvm-unit-tests): PASS: apic_disable: Local apic enabled in x2APIC mode PASS: apic_disable: CPUID.1H:EDX.APIC[bit 9] is set FAIL: apic_disable: *0xfee00030: 50014 The issue appears because we basically do nothing while switching to x2APIC mode when APIC access page is not used. apic_mmio_{read,write} only check if lAPIC is disabled before proceeding to actual write. When APIC access is virtualized we correctly manipulate with VMX controls in vmx_set_virtual_apic_mode() and we don't get vmexits from memory writes in x2APIC mode so there's no issue. Disabling MMIO interface seems to be easy. The question is: what do we do with these reads and writes? If we add apic_x2apic_mode() check to apic_mmio_in_range() and return -EOPNOTSUPP these reads and writes will go to userspace. When lAPIC is in kernel, Qemu uses this interface to inject MSIs only (see kvm_apic_mem_write() in hw/i386/kvm/apic.c). This somehow works with disabled lAPIC but when we're in xAPIC mode we will get a real injected MSI from every write to lAPIC. Not good. The simplest solution seems to be to just ignore writes to the region and return ~0 for all reads when we're in x2APIC mode. This is what this patch does. However, this approach is inconsistent with what currently happens when flexpriority is enabled: we allocate APIC access page and create KVM memory region so in x2APIC modes all reads and writes go to this pre-allocated page which is, btw, the same for all vCPUs. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/lapic.c | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 86299efa804a..fd23d5778ea1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -377,6 +377,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 17c0472c5b34..fbb0e6df121b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1344,9 +1344,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return kvm_apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; + return addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, @@ -1358,6 +1357,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + memset(data, 0xff, len); + return 0; + } + kvm_lapic_reg_read(apic, offset, len, data); return 0; @@ -1917,6 +1925,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + return 0; + } + /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. From d35b34a9a70edae7ef923f100e51b8b5ae9fe899 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Wed, 8 Aug 2018 17:45:24 -0700 Subject: [PATCH 06/23] kvm: mmu: Don't read PDPTEs when paging is not enabled kvm should not attempt to read guest PDPTEs when CR0.PG = 0 and CR4.PAE = 1. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 542f6315444d..5c870203737f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -628,7 +628,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) gfn_t gfn; int r; - if (is_long_mode(vcpu) || !is_pae(vcpu)) + if (is_long_mode(vcpu) || !is_pae(vcpu) || !is_paging(vcpu)) return false; if (!test_bit(VCPU_EXREG_PDPTR, @@ -8177,7 +8177,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); - if (!is_long_mode(vcpu) && is_pae(vcpu)) { + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } From 83b20b28c670868bfb717be4fe1557c925a81657 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 7 Sep 2018 19:59:47 +0800 Subject: [PATCH 07/23] KVM: x86: don't reset root in kvm_mmu_setup() Here is the code path which shows kvm_mmu_setup() is invoked after kvm_mmu_create(). Since kvm_mmu_setup() is only invoked in this code path, this means the root_hpa and prev_roots are guaranteed to be invalid. And it is not necessary to reset it again. kvm_vm_ioctl_create_vcpu() kvm_arch_vcpu_create() vmx_create_vcpu() kvm_vcpu_init() kvm_arch_vcpu_init() kvm_mmu_create() kvm_arch_vcpu_setup() kvm_mmu_setup() kvm_init_mmu() This patch set reset_roots to false in kmv_mmu_setup(). Fixes: 50c28f21d045dde8c52548f8482d456b3f0956f5 Signed-off-by: Wei Yang Reviewed-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e24ea7067373..5402c53a079b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5417,7 +5417,12 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) { MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - kvm_init_mmu(vcpu, true); + /* + * kvm_mmu_setup() is called only on vCPU initialization. + * Therefore, no need to reset mmu roots as they are not yet + * initialized. + */ + kvm_init_mmu(vcpu, false); } static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, From 6bd317d3c865ebcddcb287a424093fe4758c40ef Mon Sep 17 00:00:00 2001 From: Lei Yang Date: Wed, 29 Aug 2018 15:04:08 +0800 Subject: [PATCH 08/23] kvm: selftests: use -pthread instead of -lpthread I run into the following error testing/selftests/kvm/dirty_log_test.c:285: undefined reference to `pthread_create' testing/selftests/kvm/dirty_log_test.c:297: undefined reference to `pthread_join' collect2: error: ld returned 1 exit status my gcc version is gcc version 4.8.4 "-pthread" would work everywhere Signed-off-by: Lei Yang Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 03b0f551bedf..48c970c90353 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -20,7 +20,7 @@ INSTALL_HDR_PATH = $(top_srcdir)/usr LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ LINUX_TOOL_INCLUDE = $(top_srcdir)tools/include CFLAGS += -O2 -g -std=gnu99 -I$(LINUX_TOOL_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude -I$( Date: Fri, 7 Sep 2018 05:45:02 +0000 Subject: [PATCH 09/23] KVM/MMU: Fix comment in walk_shadow_page_lockless_end() kvm_commit_zap_page() has been renamed to kvm_mmu_commit_zap_page() This patch is to fix the commit. Signed-off-by: Lan Tianyu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5402c53a079b..d7e9bce6ff61 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -899,7 +899,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { /* * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_commit_zap_page() can see us + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. */ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); From a101c9d63ebb294144e596bfe9b4ae3156b1be96 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Aug 2018 14:49:59 +0300 Subject: [PATCH 10/23] KVM: SVM: Switch to bitmap_zalloc() Switch to bitmap_zalloc() to show clearly what we are allocating. Besides that it returns pointer of bitmap type instead of opaque void *. Signed-off-by: Andy Shevchenko Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 89c4c5aa15f1..c7f1c3fd782d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1226,8 +1226,7 @@ static __init int sev_hardware_setup(void) min_sev_asid = cpuid_edx(0x8000001F); /* Initialize SEV ASID bitmap */ - sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), - sizeof(unsigned long), GFP_KERNEL); + sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; @@ -1405,7 +1404,7 @@ static __exit void svm_hardware_unsetup(void) int cpu; if (svm_sev_enabled()) - kfree(sev_asid_bitmap); + bitmap_free(sev_asid_bitmap); for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); From 4c008127e4716d246b44b403f8a65ae9744d32c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Aug 2018 15:21:10 -0700 Subject: [PATCH 11/23] KVM: VMX: immediately mark preemption timer expired only for zero value A VMX preemption timer value of '0' at the time of VMEnter is architecturally guaranteed to cause a VMExit prior to the CPU executing any instructions in the guest. This architectural definition is in place to ensure that a previously expired timer is correctly recognized by the CPU as it is possible for the timer to reach zero and not trigger a VMexit due to a higher priority VMExit being signalled instead, e.g. a pending #DB that morphs into a VMExit. Whether by design or coincidence, commit f4124500c2c1 ("KVM: nVMX: Fully emulate preemption timer") special cased timer values of '0' and '1' to ensure prompt delivery of the VMExit. Unlike '0', a timer value of '1' has no has no architectural guarantees regarding when it is delivered. Modify the timer emulation to trigger immediate VMExit if and only if the timer value is '0', and document precisely why '0' is special. Do this even if calibration of the virtual TSC failed, i.e. VMExit will occur immediately regardless of the frequency of the timer. Making only '0' a special case gives KVM leeway to be more aggressive in ensuring the VMExit is injected prior to executing instructions in the nested guest, and also eliminates any ambiguity as to why '1' is a special case, e.g. why wasn't the threshold for a "short timeout" set to 10, 100, 1000, etc... Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 533a327372c8..4655d6dd6759 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11427,16 +11427,18 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); - if (vcpu->arch.virtual_tsc_khz == 0) - return; - - /* Make sure short timeouts reliably trigger an immediate vmexit. - * hrtimer_start does not guarantee this. */ - if (preemption_timeout <= 1) { + /* + * A timer value of zero is architecturally guaranteed to cause + * a VMExit prior to executing any instructions in the guest. + */ + if (preemption_timeout == 0) { vmx_preemption_timer_fn(&vmx->nested.preemption_timer); return; } + if (vcpu->arch.virtual_tsc_khz == 0) + return; + preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; preemption_timeout *= 1000000; do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); From f459a707ed313f110e4939d634317edcf9e96774 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Aug 2018 15:21:11 -0700 Subject: [PATCH 12/23] KVM: VMX: modify preemption timer bit only when arming timer Provide a singular location where the VMX preemption timer bit is set/cleared so that future usages of the preemption timer can ensure the VMCS bit is up-to-date without having to modify unrelated code paths. For example, the preemption timer can be used to force an immediate VMExit. Cache the status of the timer to avoid redundant VMREAD and VMWRITE, e.g. if the timer stays armed across multiple VMEnters/VMExits. Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 61 ++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4655d6dd6759..62670b2f6d48 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -397,6 +397,7 @@ struct loaded_vmcs { int cpu; bool launched; bool nmi_known_unmasked; + bool hv_timer_armed; /* Support for vnmi-less CPUs */ int soft_vnmi_blocked; ktime_t entry_time; @@ -10595,24 +10596,38 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host, false); } -static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct vcpu_vmx *vmx, u32 val) +{ + vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, val); + if (!vmx->loaded_vmcs->hv_timer_armed) + vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = true; +} + +static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; u32 delta_tsc; - if (vmx->hv_deadline_tsc == -1) + if (vmx->hv_deadline_tsc != -1) { + tscl = rdtsc(); + if (vmx->hv_deadline_tsc > tscl) + /* set_hv_timer ensures the delta fits in 32-bits */ + delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> + cpu_preemption_timer_multi); + else + delta_tsc = 0; + + vmx_arm_hv_timer(vmx, delta_tsc); return; + } - tscl = rdtsc(); - if (vmx->hv_deadline_tsc > tscl) - /* sure to be 32 bit only because checked on set_hv_timer */ - delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >> - cpu_preemption_timer_multi); - else - delta_tsc = 0; - - vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc); + if (vmx->loaded_vmcs->hv_timer_armed) + vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, + PIN_BASED_VMX_PREEMPTION_TIMER); + vmx->loaded_vmcs->hv_timer_armed = false; } static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -10672,7 +10687,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) atomic_switch_perf_msrs(vmx); - vmx_arm_hv_timer(vcpu); + vmx_update_hv_timer(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -12078,11 +12093,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control = vmcs12->pin_based_vm_exec_control; - /* Preemption timer setting is only taken from vmcs01. */ - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + /* Preemption timer setting is computed directly in vmx_vcpu_run. */ exec_control |= vmcs_config.pin_based_exec_ctrl; - if (vmx->hv_deadline_tsc == -1) - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + vmx->loaded_vmcs->hv_timer_armed = false; /* Posted interrupts setting is only taken from vmcs12. */ if (nested_cpu_has_posted_intr(vmcs12)) { @@ -13255,12 +13269,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); - if (vmx->hv_deadline_tsc == -1) - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - else - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -13464,18 +13473,12 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) return -ERANGE; vmx->hv_deadline_tsc = tscl + delta_tsc; - vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); - return delta_tsc == 0; } static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->hv_deadline_tsc = -1; - vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, - PIN_BASED_VMX_PREEMPTION_TIMER); + to_vmx(vcpu)->hv_deadline_tsc = -1; } #endif From d264ee0c2ed20c6a426663590d4fc7a36cb6abd7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Aug 2018 15:21:12 -0700 Subject: [PATCH 13/23] KVM: VMX: use preemption timer to force immediate VMExit A VMX preemption timer value of '0' is guaranteed to cause a VMExit prior to the CPU executing any instructions in the guest. Use the preemption timer (if it's supported) to trigger immediate VMExit in place of the current method of sending a self-IPI. This ensures that pending VMExit injection to L1 occurs prior to executing any instructions in the guest (regardless of nesting level). When deferring VMExit injection, KVM generates an immediate VMExit from the (possibly nested) guest by sending itself an IPI. Because hardware interrupts are blocked prior to VMEnter and are unblocked (in hardware) after VMEnter, this results in taking a VMExit(INTR) before any guest instruction is executed. But, as this approach relies on the IPI being received before VMEnter executes, it only works as intended when KVM is running as L0. Because there are no architectural guarantees regarding when IPIs are delivered, when running nested the INTR may "arrive" long after L2 is running e.g. L0 KVM doesn't force an immediate switch to L1 to deliver an INTR. For the most part, this unintended delay is not an issue since the events being injected to L1 also do not have architectural guarantees regarding their timing. The notable exception is the VMX preemption timer[1], which is architecturally guaranteed to cause a VMExit prior to executing any instructions in the guest if the timer value is '0' at VMEnter. Specifically, the delay in injecting the VMExit causes the preemption timer KVM unit test to fail when run in a nested guest. Note: this approach is viable even on CPUs with a broken preemption timer, as broken in this context only means the timer counts at the wrong rate. There are no known errata affecting timer value of '0'. [1] I/O SMIs also have guarantees on when they arrive, but I have no idea if/how those are emulated in KVM. Signed-off-by: Sean Christopherson [Use a hook for SVM instead of leaving the default in x86.c - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 2 ++ arch/x86/kvm/vmx.c | 21 ++++++++++++++++++++- arch/x86/kvm/x86.c | 8 +++++++- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8e90488c3d56..bffb25b50425 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1055,6 +1055,7 @@ struct kvm_x86_ops { bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1482,6 +1483,7 @@ extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu); int kvm_is_in_guest(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c7f1c3fd782d..d96092b35936 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7148,6 +7148,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .check_intercept = svm_check_intercept, .handle_external_intr = svm_handle_external_intr, + .request_immediate_exit = __kvm_request_immediate_exit, + .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 62670b2f6d48..a4a1585f47f1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1020,6 +1020,8 @@ struct vcpu_vmx { int ple_window; bool ple_window_dirty; + bool req_immediate_exit; + /* Support for PML */ #define PML_ENTITY_NUM 512 struct page *pml_pg; @@ -2865,6 +2867,8 @@ static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) u16 fs_sel, gs_sel; int i; + vmx->req_immediate_exit = false; + if (vmx->loaded_cpu_state) return; @@ -7967,6 +7971,9 @@ static __init int hardware_setup(void) kvm_x86_ops->enable_log_dirty_pt_masked = NULL; } + if (!cpu_has_vmx_preemption_timer()) + kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) { u64 vmx_msr; @@ -9209,7 +9216,8 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) static int handle_preemption_timer(struct kvm_vcpu *vcpu) { - kvm_lapic_expired_hv_timer(vcpu); + if (!to_vmx(vcpu)->req_immediate_exit) + kvm_lapic_expired_hv_timer(vcpu); return 1; } @@ -10611,6 +10619,11 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) u64 tscl; u32 delta_tsc; + if (vmx->req_immediate_exit) { + vmx_arm_hv_timer(vmx, 0); + return; + } + if (vmx->hv_deadline_tsc != -1) { tscl = rdtsc(); if (vmx->hv_deadline_tsc > tscl) @@ -12879,6 +12892,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } +static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + to_vmx(vcpu)->req_immediate_exit = true; +} + static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) { ktime_t remaining = @@ -14135,6 +14153,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, + .request_immediate_exit = vmx_request_immediate_exit, .sched_in = vmx_sched_in, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c870203737f..9d0fda9056de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7361,6 +7361,12 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); +void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) +{ + smp_send_reschedule(vcpu->cpu); +} +EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit); + /* * Returns 1 to let vcpu_run() continue the guest execution loop without * exiting to the userspace. Otherwise, the value will be returned to the @@ -7565,7 +7571,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - smp_send_reschedule(vcpu->cpu); + kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); From a1efa9b70097a7ebb7c0a10bb72648776771b281 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 27 Aug 2018 18:48:57 +0200 Subject: [PATCH 14/23] x86/hyper-v: rename ipi_arg_{ex,non_ex} structures These structures are going to be used from KVM code so let's make their names reflect their Hyper-V origin. Signed-off-by: Vitaly Kuznetsov Reviewed-by: Roman Kagan Acked-by: K. Y. Srinivasan Signed-off-by: Paolo Bonzini --- arch/x86/hyperv/hv_apic.c | 8 ++++---- arch/x86/include/asm/hyperv-tlfs.h | 16 +++++++++------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5b0f613428c2..2c43e3055948 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -95,8 +95,8 @@ static void hv_apic_eoi_write(u32 reg, u32 val) */ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) { - struct ipi_arg_ex **arg; - struct ipi_arg_ex *ipi_arg; + struct hv_send_ipi_ex **arg; + struct hv_send_ipi_ex *ipi_arg; unsigned long flags; int nr_bank = 0; int ret = 1; @@ -105,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector) return false; local_irq_save(flags); - arg = (struct ipi_arg_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); + arg = (struct hv_send_ipi_ex **)this_cpu_ptr(hyperv_pcpu_input_arg); ipi_arg = *arg; if (unlikely(!ipi_arg)) @@ -135,7 +135,7 @@ ipi_mask_ex_done: static bool __send_ipi_mask(const struct cpumask *mask, int vector) { int cur_cpu, vcpu; - struct ipi_arg_non_ex ipi_arg; + struct hv_send_ipi ipi_arg; int ret = 1; trace_hyperv_send_ipi_mask(mask, vector); diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index e977b6b3a538..00e01d215f74 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -726,19 +726,21 @@ struct hv_enlightened_vmcs { #define HV_STIMER_AUTOENABLE (1ULL << 3) #define HV_STIMER_SINT(config) (__u8)(((config) >> 16) & 0x0F) -struct ipi_arg_non_ex { - u32 vector; - u32 reserved; - u64 cpu_mask; -}; - struct hv_vpset { u64 format; u64 valid_bank_mask; u64 bank_contents[]; }; -struct ipi_arg_ex { +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +}; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { u32 vector; u32 reserved; struct hv_vpset vp_set; From 822f312d47f0200dc0999c9f006fe94aa43bd0bd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 12 Sep 2018 15:33:45 +0200 Subject: [PATCH 15/23] kvm: x86: make kvm_{load|put}_guest_fpu() static The functions kvm_load_guest_fpu() kvm_put_guest_fpu() are only used locally, make them static. This requires also that both functions are moved because they are used before their implementation. Those functions were exported (via EXPORT_SYMBOL) before commit e5bb40251a920 ("KVM: Drop kvm_{load,put}_guest_fpu() exports"). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 46 ++++++++++++++++++++-------------------- include/linux/kvm_host.h | 2 -- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d0fda9056de..6f4789398876 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7835,6 +7835,29 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) return 0; } +/* Swap (qemu) user FPU context for the guest FPU context. */ +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); + /* PKRU is separately restored in kvm_x86_ops->run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + ~XFEATURE_MASK_PKRU); + preempt_enable(); + trace_kvm_fpu(1); +} + +/* When vcpu_run ends, restore user space FPU context. */ +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); + copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); + preempt_enable(); + ++vcpu->stat.fpu_reload; + trace_kvm_fpu(0); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -8412,29 +8435,6 @@ static void fx_init(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= X86_CR0_ET; } -/* Swap (qemu) user FPU context for the guest FPU context. */ -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, - ~XFEATURE_MASK_PKRU); - preempt_enable(); - trace_kvm_fpu(1); -} - -/* When vcpu_run ends, restore user space FPU context. */ -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); - preempt_enable(); - ++vcpu->stat.fpu_reload; - trace_kvm_fpu(0); -} - void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0205aee44ded..c926698040e0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -733,8 +733,6 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); -void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); -void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); From 5bea5123cbf08f990a1aee8f08c643a272e06a0f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 18 Sep 2018 15:19:17 +0200 Subject: [PATCH 16/23] KVM: VMX: check nested state and CR4.VMXE against SMM VMX cannot be enabled under SMM, check it when CR4 is set and when nested virtualization state is restored. This should fix some WARNs reported by syzkaller, mostly around alloc_shadow_vmcs. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4a1585f47f1..16e63a92992f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5398,9 +5398,10 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) * To use VMXON (and later other VMX instructions), a guest * must first be able to turn on cr4.VMXE (see handle_vmon()). * So basically the check on whether to allow nested VMX - * is here. + * is here. We operate under the default treatment of SMM, + * so VMX cannot be enabled under SMM. */ - if (!nested_vmx_allowed(vcpu)) + if (!nested_vmx_allowed(vcpu) || is_smm(vcpu)) return 1; } @@ -13977,6 +13978,14 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; + /* + * SMM temporarily disables VMX, so we cannot be in guest mode, + * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags + * must be zero. + */ + if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags) + return -EINVAL; + if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) return -EINVAL; From e6c67d8cf1173b229f0c4343d1cc7925eca11c11 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Tue, 4 Sep 2018 10:56:52 +0300 Subject: [PATCH 17/23] KVM: nVMX: Wake blocked vCPU in guest-mode if pending interrupt in virtual APICv In case L1 do not intercept L2 HLT or enter L2 in HLT activity-state, it is possible for a vCPU to be blocked while it is in guest-mode. According to Intel SDM 26.6.5 Interrupt-Window Exiting and Virtual-Interrupt Delivery: "These events wake the logical processor if it just entered the HLT state because of a VM entry". Therefore, if L1 enters L2 in HLT activity-state and L2 has a pending deliverable interrupt in vmcs12->guest_intr_status.RVI, then the vCPU should be waken from the HLT state and injected with the interrupt. In addition, if while the vCPU is blocked (while it is in guest-mode), it receives a nested posted-interrupt, then the vCPU should also be waken and injected with the posted interrupt. To handle these cases, this patch enhances kvm_vcpu_has_events() to also check if there is a pending interrupt in L2 virtual APICv provided by L1. That is, it evaluates if there is a pending virtual interrupt for L2 by checking RVI[7:4] > VPPR[7:4] as specified in Intel SDM 29.2.1 Evaluation of Pending Interrupts. Note that this also handles the case of nested posted-interrupt by the fact RVI is updated in vmx_complete_nested_posted_interrupt() which is called from kvm_vcpu_check_block() -> kvm_arch_vcpu_runnable() -> kvm_vcpu_running() -> vmx_check_nested_events() -> vmx_complete_nested_posted_interrupt(). Reviewed-by: Nikita Leshenko Reviewed-by: Darren Kenny Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 22 ++++++++++++++++++++++ arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bffb25b50425..af63c2ca1616 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1022,6 +1022,7 @@ struct kvm_x86_ops { void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); + bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 16e63a92992f..98b1203e8823 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6189,6 +6189,27 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) nested_mark_vmcs12_pages_dirty(vcpu); } +static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + void *vapic_page; + u32 vppr; + int rvi; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu)) || + !nested_cpu_has_vid(get_vmcs12(vcpu)) || + WARN_ON_ONCE(!vmx->nested.virtual_apic_page)) + return false; + + rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff; + + vapic_page = kmap(vmx->nested.virtual_apic_page); + vppr = *((u32 *)(vapic_page + APIC_PROCPRI)); + kunmap(vmx->nested.virtual_apic_page); + + return ((rvi & 0xf0) > (vppr & 0xf0)); +} + static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, bool nested) { @@ -14129,6 +14150,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .apicv_post_state_restore = vmx_apicv_post_state_restore, .hwapic_irr_update = vmx_hwapic_irr_update, .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, .deliver_posted_interrupt = vmx_deliver_posted_interrupt, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f4789398876..5fea53cdc583 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9206,6 +9206,13 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, kvm_page_track_flush_slot(kvm, slot); } +static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) +{ + return (is_guest_mode(vcpu) && + kvm_x86_ops->guest_apic_has_interrupt && + kvm_x86_ops->guest_apic_has_interrupt(vcpu)); +} + static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) @@ -9230,7 +9237,8 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return true; if (kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) + (kvm_cpu_has_interrupt(vcpu) || + kvm_guest_apic_has_interrupt(vcpu))) return true; if (kvm_hv_has_stimer_pending(vcpu)) From 6de84e581c083c4357b45c31b7ef71335725d850 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Thu, 23 Aug 2018 20:03:03 -0400 Subject: [PATCH 18/23] nVMX x86: check posted-interrupt descriptor addresss on vmentry of L2 According to section "Checks on VMX Controls" in Intel SDM vol 3C, the following check needs to be enforced on vmentry of L2 guests: - Bits 5:0 of the posted-interrupt descriptor address are all 0. - The posted-interrupt descriptor address does not set any bits beyond the processor's physical-address width. Signed-off-by: Krish Sadhukhan Reviewed-by: Mark Kanda Reviewed-by: Liran Alon Reviewed-by: Darren Kenny Reviewed-by: Karl Heubaum Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98b1203e8823..581bdbd9844b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11698,11 +11698,15 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, * bits 15:8 should be zero in posted_intr_nv, * the descriptor address has been already checked * in nested_get_vmcs12_pages. + * + * bits 5:0 of posted_intr_desc_addr should be zero. */ if (nested_cpu_has_posted_intr(vmcs12) && (!nested_cpu_has_vid(vmcs12) || !nested_exit_intr_ack_set(vcpu) || - vmcs12->posted_intr_nv & 0xff00)) + (vmcs12->posted_intr_nv & 0xff00) || + (vmcs12->posted_intr_desc_addr & 0x3f) || + (!page_address_valid(vcpu, vmcs12->posted_intr_desc_addr)))) return -EINVAL; /* tpr shadow is needed by all apicv features. */ From ba8e23db59dc07e5de74fd7bd310e297d3e4ba54 Mon Sep 17 00:00:00 2001 From: Krish Sadhukhan Date: Tue, 4 Sep 2018 14:42:58 -0400 Subject: [PATCH 19/23] nVMX x86: Check VPID value on vmentry of L2 guests According to section "Checks on VMX Controls" in Intel SDM vol 3C, the following check needs to be enforced on vmentry of L2 guests: If the 'enable VPID' VM-execution control is 1, the value of the of the VPID VM-execution control field must not be 0000H. Signed-off-by: Krish Sadhukhan Reviewed-by: Mark Kanda Reviewed-by: Liran Alon Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 581bdbd9844b..06412ba46aa3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12373,6 +12373,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id) + return VMXERR_ENTRY_INVALID_CONTROL_FIELD; + if (nested_vmx_check_io_bitmap_controls(vcpu, vmcs12)) return VMXERR_ENTRY_INVALID_CONTROL_FIELD; From d84f1cff9028c00ee870f0293b0c7a3866071dfa Mon Sep 17 00:00:00 2001 From: Drew Schmitt Date: Mon, 20 Aug 2018 10:32:14 -0700 Subject: [PATCH 20/23] KVM: x86: Turbo bits in MSR_PLATFORM_INFO Allow userspace to set turbo bits in MSR_PLATFORM_INFO. Previously, only the CPUID faulting bit was settable. But now any bit in MSR_PLATFORM_INFO would be settable. This can be used, for example, to convey frequency information about the platform on which the guest is running. Signed-off-by: Drew Schmitt Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5fea53cdc583..e127703e277e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2537,7 +2537,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_PLATFORM_INFO: if (!msr_info->host_initiated || - data & ~MSR_PLATFORM_INFO_CPUID_FAULT || (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && cpuid_fault_enabled(vcpu))) return 1; From 6fbbde9a1969dfb476467ebf69a475095ef3fd4d Mon Sep 17 00:00:00 2001 From: Drew Schmitt Date: Mon, 20 Aug 2018 10:32:15 -0700 Subject: [PATCH 21/23] KVM: x86: Control guest reads of MSR_PLATFORM_INFO Add KVM_CAP_MSR_PLATFORM_INFO so that userspace can disable guest access to reads of MSR_PLATFORM_INFO. Disabling access to reads of this MSR gives userspace the control to "expose" this platform-dependent information to guests in a clear way. As it exists today, guests that read this MSR would get unpopulated information if userspace hadn't already set it (and prior to this patch series, only the CPUID faulting information could have been populated). This existing interface could be confusing if guests don't handle the potential for incorrect/incomplete information gracefully (e.g. zero reported for base frequency). Signed-off-by: Drew Schmitt Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 10 ++++++++++ include/uapi/linux/kvm.h | 1 + 4 files changed, 22 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 8d8a372c8340..647f94128a85 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -4522,6 +4522,15 @@ hpage module parameter is not set to 1, -EINVAL is returned. While it is generally possible to create a huge page backed VM without this capability, the VM will not be able to run. +7.14 KVM_CAP_MSR_PLATFORM_INFO + +Architectures: x86 +Parameters: args[0] whether feature should be enabled or not + +With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, +a #GP would be raised when the guest tries to access. Currently, this +capability does not enable write permissions of this MSR for the guest. + 8. Other capabilities. ---------------------- diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af63c2ca1616..09b2e3e2cf1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -869,6 +869,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + bool guest_can_read_msr_platform_info; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e127703e277e..4c39ec5fc4fe 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2779,6 +2779,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.osvw.status; break; case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated && + !vcpu->kvm->arch.guest_can_read_msr_platform_info) + return 1; msr_info->data = vcpu->arch.msr_platform_info; break; case MSR_MISC_FEATURES_ENABLES: @@ -2926,6 +2929,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: case KVM_CAP_GET_MSR_FEATURES: + case KVM_CAP_MSR_PLATFORM_INFO: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4349,6 +4353,10 @@ split_irqchip_unlock: kvm->arch.pause_in_guest = true; r = 0; break; + case KVM_CAP_MSR_PLATFORM_INFO: + kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; + r = 0; + break; default: r = -EINVAL; break; @@ -8857,6 +8865,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); pvclock_update_vm_gtod_copy(kvm); + kvm->arch.guest_can_read_msr_platform_info = true; + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 07548de5c988..251be353f950 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -952,6 +952,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_HPAGE_1M 156 #define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158 +#define KVM_CAP_MSR_PLATFORM_INFO 159 #ifdef KVM_CAP_IRQ_ROUTING From 8b56ee91ffc88ea01400c012e10fe22a9d233265 Mon Sep 17 00:00:00 2001 From: Drew Schmitt Date: Mon, 20 Aug 2018 10:32:16 -0700 Subject: [PATCH 22/23] kvm: selftests: Add platform_info_test Test guest access to MSR_PLATFORM_INFO when the capability is enabled or disabled. Signed-off-by: Drew Schmitt Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/.gitignore | 1 + tools/testing/selftests/kvm/Makefile | 3 +- .../testing/selftests/kvm/include/kvm_util.h | 4 + tools/testing/selftests/kvm/lib/kvm_util.c | 89 ++++++++++++++ .../selftests/kvm/platform_info_test.c | 110 ++++++++++++++++++ 5 files changed, 206 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/kvm/platform_info_test.c diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 4202139d81d9..5c34752e1cff 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,4 +1,5 @@ cr4_cpuid_sync_test +platform_info_test set_sregs_test sync_regs_test vmx_tsc_adjust_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 48c970c90353..37e4bd8619a6 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,7 +6,8 @@ UNAME_M := $(shell uname -m) LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c LIBKVM_x86_64 = lib/x86.c lib/vmx.c -TEST_GEN_PROGS_x86_64 = set_sregs_test +TEST_GEN_PROGS_x86_64 = platform_info_test +TEST_GEN_PROGS_x86_64 += set_sregs_test TEST_GEN_PROGS_x86_64 += sync_regs_test TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index bb5a25fb82c6..3acf9a91704c 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -50,6 +50,7 @@ enum vm_mem_backing_src_type { }; int kvm_check_cap(long cap); +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm); void kvm_vm_free(struct kvm_vm *vmp); @@ -108,6 +109,9 @@ void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index); +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value); const char *exit_reason_str(unsigned int exit_reason); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e9ba389c48db..6fd8c089cafc 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -63,6 +63,29 @@ int kvm_check_cap(long cap) return ret; } +/* VM Enable Capability + * + * Input Args: + * vm - Virtual Machine + * cap - Capability + * + * Output Args: None + * + * Return: On success, 0. On failure a TEST_ASSERT failure is produced. + * + * Enables a capability (KVM_CAP_*) on the VM. + */ +int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap) +{ + int ret; + + ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap); + TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n" + " rc: %i errno: %i", ret, errno); + + return ret; +} + static void vm_open(struct kvm_vm *vm, int perm) { vm->kvm_fd = open(KVM_DEV_PATH, perm); @@ -1220,6 +1243,72 @@ void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, ret, errno); } +/* VCPU Get MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * + * Output Args: None + * + * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced. + * + * Get value of MSR for VCPU. + */ +uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + r = ioctl(vcpu->fd, KVM_GET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_GET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); + + return buffer.entry.data; +} + +/* VCPU Set MSR + * + * Input Args: + * vm - Virtual Machine + * vcpuid - VCPU ID + * msr_index - Index of MSR + * msr_value - New value of MSR + * + * Output Args: None + * + * Return: On success, nothing. On failure a TEST_ASSERT is produced. + * + * Set value of MSR for VCPU. + */ +void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index, + uint64_t msr_value) +{ + struct vcpu *vcpu = vcpu_find(vm, vcpuid); + struct { + struct kvm_msrs header; + struct kvm_msr_entry entry; + } buffer = {}; + int r; + + TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid); + memset(&buffer, 0, sizeof(buffer)); + buffer.header.nmsrs = 1; + buffer.entry.index = msr_index; + buffer.entry.data = msr_value; + r = ioctl(vcpu->fd, KVM_SET_MSRS, &buffer.header); + TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n" + " rc: %i errno: %i", r, errno); +} + /* VM VCPU Args Set * * Input Args: diff --git a/tools/testing/selftests/kvm/platform_info_test.c b/tools/testing/selftests/kvm/platform_info_test.c new file mode 100644 index 000000000000..3764e7121265 --- /dev/null +++ b/tools/testing/selftests/kvm/platform_info_test.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for x86 KVM_CAP_MSR_PLATFORM_INFO + * + * Copyright (C) 2018, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Verifies expected behavior of controlling guest access to + * MSR_PLATFORM_INFO. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "x86.h" + +#define VCPU_ID 0 +#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00 + +static void guest_code(void) +{ + uint64_t msr_platform_info; + + for (;;) { + msr_platform_info = rdmsr(MSR_PLATFORM_INFO); + GUEST_SYNC(msr_platform_info); + asm volatile ("inc %r11"); + } +} + +static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable) +{ + struct kvm_enable_cap cap = {}; + + cap.cap = KVM_CAP_MSR_PLATFORM_INFO; + cap.flags = 0; + cap.args[0] = (int)enable; + vm_enable_cap(vm, &cap); +} + +static void test_msr_platform_info_enabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + struct guest_args args; + + set_msr_platform_info_enabled(vm, true); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Exit_reason other than KVM_EXIT_IO: %u (%s),\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + guest_args_read(vm, VCPU_ID, &args); + TEST_ASSERT(args.port == GUEST_PORT_SYNC, + "Received IO from port other than PORT_HOST_SYNC: %u\n", + run->io.port); + TEST_ASSERT((args.arg1 & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) == + MSR_PLATFORM_INFO_MAX_TURBO_RATIO, + "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.", + MSR_PLATFORM_INFO_MAX_TURBO_RATIO); +} + +static void test_msr_platform_info_disabled(struct kvm_vm *vm) +{ + struct kvm_run *run = vcpu_state(vm, VCPU_ID); + + set_msr_platform_info_enabled(vm, false); + vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN, + "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *state; + int rv; + uint64_t msr_platform_info; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO); + if (!rv) { + fprintf(stderr, + "KVM_CAP_MSR_PLATFORM_INFO not supported, skip test\n"); + exit(KSFT_SKIP); + } + + vm = vm_create_default(VCPU_ID, 0, guest_code); + + msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, + msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO); + test_msr_platform_info_disabled(vm); + test_msr_platform_info_enabled(vm); + vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info); + + kvm_vm_free(vm); + + return 0; +} From 26b471c7e2f7befd0f59c35b257749ca57e0ed70 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 16 Sep 2018 14:28:20 +0300 Subject: [PATCH 23/23] KVM: nVMX: Fix bad cleanup on error of get/set nested state IOCTLs The handlers of IOCTLs in kvm_arch_vcpu_ioctl() are expected to set their return value in "r" local var and break out of switch block when they encounter some error. This is because vcpu_load() is called before the switch block which have a proper cleanup of vcpu_put() afterwards. However, KVM_{GET,SET}_NESTED_STATE IOCTLs handlers just return immediately on error without performing above mentioned cleanup. Thus, change these handlers to behave as expected. Fixes: 8fcc4b5923af ("kvm: nVMX: Introduce KVM_CAP_NESTED_STATE") Reviewed-by: Mark Kanda Reviewed-by: Patrick Colp Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c39ec5fc4fe..edbf00ec56b3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4010,19 +4010,23 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); + r = -EFAULT; if (get_user(user_data_size, &user_kvm_nested_state->size)) - return -EFAULT; + break; r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) - return r; + break; if (r > user_data_size) { if (put_user(r, &user_kvm_nested_state->size)) - return -EFAULT; - return -E2BIG; + r = -EFAULT; + else + r = -E2BIG; + break; } + r = 0; break; } @@ -4034,19 +4038,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!kvm_x86_ops->set_nested_state) break; + r = -EFAULT; if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state))) - return -EFAULT; + break; + r = -EINVAL; if (kvm_state.size < sizeof(kvm_state)) - return -EINVAL; + break; if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) - return -EINVAL; + break; /* nested_run_pending implies guest_mode. */ if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) - return -EINVAL; + break; r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); break;