More x86 fixes:

* Logic bugs in CR0 writes and Hyper-V hypercalls
 * Don't use Enlightened MSR Bitmap for L3
 * Remove user-triggerable WARN
 
 Plus a few selftest fixes and a regression test for the
 user-triggerable WARN.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmGzZuIUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroMD4Qf+Im7Q0XNRZGzK6x4Blu3ZZJSuIIkW
 gEK5mDX/BWSYxoGhRN0IOkyf1Tx/A5qYwbZts87wZSvKONG2MuVzdeQ0mkDxgKc3
 cYwvvIPxCKaW/dQLD2OKVlqdAv6YbeJiFURWXgszMkrcgHvw39H5Tn6ldi0B5nvg
 Gvpj8LtbPDXGXab//Xrhia3+1F9TKOrcOG+obGC5G2mrGKTkG2+pi9L6LohvENhd
 sOSWdpmvQTU4PeqGlhW8RCwcN+vpa+NasHT2i2tHcWZA9Lqp4P81+4ZyQLIBsRB3
 psANG0c40EW+lfjFGbLL/6VR5kypxa6zy9RgX+QiRcj6C0+dJOgnwNY35A==
 =cREz
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "More x86 fixes:

   - Logic bugs in CR0 writes and Hyper-V hypercalls

   - Don't use Enlightened MSR Bitmap for L3

   - Remove user-triggerable WARN

  Plus a few selftest fixes and a regression test for the
  user-triggerable WARN"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  selftests: KVM: Add test to verify KVM doesn't explode on "bad" I/O
  KVM: x86: Don't WARN if userspace mucks with RCX during string I/O exit
  KVM: X86: Raise #GP when clearing CR0_PG in 64 bit mode
  selftests: KVM: avoid failures due to reserved HyperTransport region
  KVM: x86: Ignore sparse banks size for an "all CPUs", non-sparse IPI req
  KVM: x86: Wait for IPIs to be delivered when handling Hyper-V TLB flush hypercall
  KVM: x86: selftests: svm_int_ctl_test: fix intercept calculation
  KVM: nVMX: Don't use Enlightened MSR Bitmap for L3
This commit is contained in:
Linus Torvalds 2021-12-10 14:09:12 -08:00
commit b9172f9e88
11 changed files with 223 additions and 17 deletions

View file

@ -97,7 +97,7 @@
KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP)
KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_APF_READY KVM_ARCH_REQ(28)
#define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29)
#define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \

View file

@ -1922,11 +1922,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
all_cpus = send_ipi_ex.vp_set.format == HV_GENERIC_SET_ALL;
if (all_cpus)
goto check_and_send_ipi;
if (!sparse_banks_len)
goto ret_success;
if (!all_cpus &&
kvm_read_guest(kvm,
if (kvm_read_guest(kvm,
hc->ingpa + offsetof(struct hv_send_ipi_ex,
vp_set.bank_contents),
sparse_banks,
@ -1934,6 +1936,7 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc, bool
return HV_STATUS_INVALID_HYPERCALL_INPUT;
}
check_and_send_ipi:
if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR))
return HV_STATUS_INVALID_HYPERCALL_INPUT;

View file

@ -2646,15 +2646,6 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
if (!loaded_vmcs->msr_bitmap)
goto out_vmcs;
memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
if (IS_ENABLED(CONFIG_HYPERV) &&
static_branch_unlikely(&enable_evmcs) &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs =
(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
}
memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@ -6842,6 +6833,19 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
if (err < 0)
goto free_pml;
/*
* Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
* nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
* feature only for vmcs01, KVM currently isn't equipped to realize any
* performance benefits from enabling it for vmcs02.
*/
if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
evmcs->hv_enlightenments_control.msr_bitmap = 1;
}
/* The MSR bitmap starts with all ones */
bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);

View file

@ -890,7 +890,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
return 1;
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
if (!(cr0 & X86_CR0_PG) &&
(is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
return 1;
static_call(kvm_x86_set_cr0)(vcpu, cr0);
@ -7121,7 +7122,13 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
unsigned short port, void *val, unsigned int count)
{
if (vcpu->arch.pio.count) {
/* Complete previous iteration. */
/*
* Complete a previous iteration that required userspace I/O.
* Note, @count isn't guaranteed to match pio.count as userspace
* can modify ECX before rerunning the vCPU. Ignore any such
* shenanigans as KVM doesn't support modifying the rep count,
* and the emulator ensures @count doesn't overflow the buffer.
*/
} else {
int r = __emulator_pio_in(vcpu, size, port, count);
if (!r)
@ -7130,7 +7137,6 @@ static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
/* Results already available, fall through. */
}
WARN_ON(count != vcpu->arch.pio.count);
complete_emulator_pio_in(vcpu, val);
return 1;
}

View file

@ -30,6 +30,7 @@
/x86_64/svm_int_ctl_test
/x86_64/sync_regs_test
/x86_64/tsc_msrs_test
/x86_64/userspace_io_test
/x86_64/userspace_msr_exit_test
/x86_64/vmx_apic_access_test
/x86_64/vmx_close_while_nested_test

View file

@ -59,6 +59,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
TEST_GEN_PROGS_x86_64 += x86_64/svm_int_ctl_test
TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_io_test
TEST_GEN_PROGS_x86_64 += x86_64/userspace_msr_exit_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test

View file

@ -71,6 +71,15 @@ enum vm_guest_mode {
#endif
#if defined(__x86_64__)
unsigned long vm_compute_max_gfn(struct kvm_vm *vm);
#else
static inline unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
return ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
}
#endif
#define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT)
#define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE)

View file

@ -302,7 +302,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
(1ULL << (vm->va_bits - 1)) >> vm->page_shift);
/* Limit physical addresses to PA-bits. */
vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
vm->max_gfn = vm_compute_max_gfn(vm);
/* Allocate and setup memory for guest. */
vm->vpages_mapped = sparsebit_alloc();

View file

@ -1431,3 +1431,71 @@ struct kvm_cpuid2 *vcpu_get_supported_hv_cpuid(struct kvm_vm *vm, uint32_t vcpui
return cpuid;
}
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163
#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65
static inline unsigned x86_family(unsigned int eax)
{
unsigned int x86;
x86 = (eax >> 8) & 0xf;
if (x86 == 0xf)
x86 += (eax >> 20) & 0xff;
return x86;
}
unsigned long vm_compute_max_gfn(struct kvm_vm *vm)
{
const unsigned long num_ht_pages = 12 << (30 - vm->page_shift); /* 12 GiB */
unsigned long ht_gfn, max_gfn, max_pfn;
uint32_t eax, ebx, ecx, edx, max_ext_leaf;
max_gfn = (1ULL << (vm->pa_bits - vm->page_shift)) - 1;
/* Avoid reserved HyperTransport region on AMD processors. */
eax = ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx ||
ecx != X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx ||
edx != X86EMUL_CPUID_VENDOR_AuthenticAMD_edx)
return max_gfn;
/* On parts with <40 physical address bits, the area is fully hidden */
if (vm->pa_bits < 40)
return max_gfn;
/* Before family 17h, the HyperTransport area is just below 1T. */
ht_gfn = (1 << 28) - num_ht_pages;
eax = 1;
cpuid(&eax, &ebx, &ecx, &edx);
if (x86_family(eax) < 0x17)
goto done;
/*
* Otherwise it's at the top of the physical address space, possibly
* reduced due to SME by bits 11:6 of CPUID[0x8000001f].EBX. Use
* the old conservative value if MAXPHYADDR is not enumerated.
*/
eax = 0x80000000;
cpuid(&eax, &ebx, &ecx, &edx);
max_ext_leaf = eax;
if (max_ext_leaf < 0x80000008)
goto done;
eax = 0x80000008;
cpuid(&eax, &ebx, &ecx, &edx);
max_pfn = (1ULL << ((eax & 0xff) - vm->page_shift)) - 1;
if (max_ext_leaf >= 0x8000001f) {
eax = 0x8000001f;
cpuid(&eax, &ebx, &ecx, &edx);
max_pfn >>= (ebx >> 6) & 0x3f;
}
ht_gfn = max_pfn - num_ht_pages;
done:
return min(max_gfn, ht_gfn - 1);
}

View file

@ -75,7 +75,7 @@ static void l1_guest_code(struct svm_test_data *svm)
vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
/* No intercepts for real and virtual interrupts */
vmcb->control.intercept &= ~(1ULL << INTERCEPT_INTR | INTERCEPT_VINTR);
vmcb->control.intercept &= ~(BIT(INTERCEPT_INTR) | BIT(INTERCEPT_VINTR));
/* Make a virtual interrupt VINTR_IRQ_NUMBER pending */
vmcb->control.int_ctl |= V_IRQ_MASK | (0x1 << V_INTR_PRIO_SHIFT);

View file

@ -0,0 +1,114 @@
// SPDX-License-Identifier: GPL-2.0
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include "test_util.h"
#include "kvm_util.h"
#include "processor.h"
#define VCPU_ID 1
static void guest_ins_port80(uint8_t *buffer, unsigned int count)
{
unsigned long end;
if (count == 2)
end = (unsigned long)buffer + 1;
else
end = (unsigned long)buffer + 8192;
asm volatile("cld; rep; insb" : "+D"(buffer), "+c"(count) : "d"(0x80) : "memory");
GUEST_ASSERT_1(count == 0, count);
GUEST_ASSERT_2((unsigned long)buffer == end, buffer, end);
}
static void guest_code(void)
{
uint8_t buffer[8192];
int i;
/*
* Special case tests. main() will adjust RCX 2 => 1 and 3 => 8192 to
* test that KVM doesn't explode when userspace modifies the "count" on
* a userspace I/O exit. KVM isn't required to play nice with the I/O
* itself as KVM doesn't support manipulating the count, it just needs
* to not explode or overflow a buffer.
*/
guest_ins_port80(buffer, 2);
guest_ins_port80(buffer, 3);
/* Verify KVM fills the buffer correctly when not stuffing RCX. */
memset(buffer, 0, sizeof(buffer));
guest_ins_port80(buffer, 8192);
for (i = 0; i < 8192; i++)
GUEST_ASSERT_2(buffer[i] == 0xaa, i, buffer[i]);
GUEST_DONE();
}
int main(int argc, char *argv[])
{
struct kvm_regs regs;
struct kvm_run *run;
struct kvm_vm *vm;
struct ucall uc;
int rc;
/* Tell stdout not to buffer its content */
setbuf(stdout, NULL);
/* Create VM */
vm = vm_create_default(VCPU_ID, 0, guest_code);
run = vcpu_state(vm, VCPU_ID);
memset(&regs, 0, sizeof(regs));
while (1) {
rc = _vcpu_run(vm, VCPU_ID);
TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
"Unexpected exit reason: %u (%s),\n",
run->exit_reason,
exit_reason_str(run->exit_reason));
if (get_ucall(vm, VCPU_ID, &uc))
break;
TEST_ASSERT(run->io.port == 0x80,
"Expected I/O at port 0x80, got port 0x%x\n", run->io.port);
/*
* Modify the rep string count in RCX: 2 => 1 and 3 => 8192.
* Note, this abuses KVM's batching of rep string I/O to avoid
* getting stuck in an infinite loop. That behavior isn't in
* scope from a testing perspective as it's not ABI in any way,
* i.e. it really is abusing internal KVM knowledge.
*/
vcpu_regs_get(vm, VCPU_ID, &regs);
if (regs.rcx == 2)
regs.rcx = 1;
if (regs.rcx == 3)
regs.rcx = 8192;
memset((void *)run + run->io.data_offset, 0xaa, 4096);
vcpu_regs_set(vm, VCPU_ID, &regs);
}
switch (uc.cmd) {
case UCALL_DONE:
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld : argN+1 = 0x%lx, argN+2 = 0x%lx",
(const char *)uc.args[0], __FILE__, uc.args[1],
uc.args[2], uc.args[3]);
default:
TEST_FAIL("Unknown ucall %lu", uc.cmd);
}
kvm_vm_free(vm);
return 0;
}