From 3b5c082bbfa20d9a57924edd655bbe63fe98ab06 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Oct 2022 23:41:50 +0000 Subject: [PATCH 01/13] KVM: arm64: Work out supported block level at compile time Work out the minimum page table level where KVM supports block mappings at compile time. While at it, rewrite the comment around supported block mappings to directly describe what KVM supports instead of phrasing in terms of what it does not. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221007234151.461779-2-oliver.upton@linux.dev --- arch/arm64/include/asm/kvm_pgtable.h | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 1b098bd4cd37..3252eb50ecfe 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -13,6 +13,18 @@ #define KVM_PGTABLE_MAX_LEVELS 4U +/* + * The largest supported block sizes for KVM (no 52-bit PA support): + * - 4K (level 1): 1GB + * - 16K (level 2): 32MB + * - 64K (level 2): 512MB + */ +#ifdef CONFIG_ARM64_4K_PAGES +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 1U +#else +#define KVM_PGTABLE_MIN_BLOCK_LEVEL 2U +#endif + static inline u64 kvm_get_parange(u64 mmfr0) { u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, @@ -58,11 +70,7 @@ static inline u64 kvm_granule_size(u32 level) static inline bool kvm_level_supports_block_mapping(u32 level) { - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); + return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } /** From 5994bc9e05c2f8811f233aa434e391cd2783f0f5 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 7 Oct 2022 23:41:51 +0000 Subject: [PATCH 02/13] KVM: arm64: Limit stage2_apply_range() batch size to largest block Presently stage2_apply_range() works on a batch of memory addressed by a stage 2 root table entry for the VM. Depending on the IPA limit of the VM and PAGE_SIZE of the host, this could address a massive range of memory. Some examples: 4 level, 4K paging -> 512 GB batch size 3 level, 64K paging -> 4TB batch size Unsurprisingly, working on such a large range of memory can lead to soft lockups. When running dirty_log_perf_test: ./dirty_log_perf_test -m -2 -s anonymous_thp -b 4G -v 48 watchdog: BUG: soft lockup - CPU#0 stuck for 45s! [dirty_log_perf_:16703] Modules linked in: vfat fat cdc_ether usbnet mii xhci_pci xhci_hcd sha3_generic gq(O) CPU: 0 PID: 16703 Comm: dirty_log_perf_ Tainted: G O 6.0.0-smp-DEV #1 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : dcache_clean_inval_poc+0x24/0x38 lr : clean_dcache_guest_page+0x28/0x4c sp : ffff800021763990 pmr_save: 000000e0 x29: ffff800021763990 x28: 0000000000000005 x27: 0000000000000de0 x26: 0000000000000001 x25: 00400830b13bc77f x24: ffffad4f91ead9c0 x23: 0000000000000000 x22: ffff8000082ad9c8 x21: 0000fffafa7bc000 x20: ffffad4f9066ce50 x19: 0000000000000003 x18: ffffad4f92402000 x17: 000000000000011b x16: 000000000000011b x15: 0000000000000124 x14: ffff07ff8301d280 x13: 0000000000000000 x12: 00000000ffffffff x11: 0000000000010001 x10: fffffc0000000000 x9 : ffffad4f9069e580 x8 : 000000000000000c x7 : 0000000000000000 x6 : 000000000000003f x5 : ffff07ffa2076980 x4 : 0000000000000001 x3 : 000000000000003f x2 : 0000000000000040 x1 : ffff0830313bd000 x0 : ffff0830313bcc40 Call trace: dcache_clean_inval_poc+0x24/0x38 stage2_unmap_walker+0x138/0x1ec __kvm_pgtable_walk+0x130/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 __kvm_pgtable_walk+0x170/0x1d4 kvm_pgtable_stage2_unmap+0xc4/0xf8 kvm_arch_flush_shadow_memslot+0xa4/0x10c kvm_set_memslot+0xb8/0x454 __kvm_set_memory_region+0x194/0x244 kvm_vm_ioctl_set_memory_region+0x58/0x7c kvm_vm_ioctl+0x49c/0x560 __arm64_sys_ioctl+0x9c/0xd4 invoke_syscall+0x4c/0x124 el0_svc_common+0xc8/0x194 do_el0_svc+0x38/0xc0 el0_svc+0x2c/0xa4 el0t_64_sync_handler+0x84/0xf0 el0t_64_sync+0x1a0/0x1a4 Use the largest supported block mapping for the configured page size as the batch granularity. In so doing the walker is guaranteed to visit a leaf only once. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221007234151.461779-3-oliver.upton@linux.dev --- arch/arm64/include/asm/stage2_pgtable.h | 20 -------------------- arch/arm64/kvm/mmu.c | 9 ++++++++- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index fe341a6578c3..c8dca8ae359c 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -10,13 +10,6 @@ #include -/* - * PGDIR_SHIFT determines the size a top-level page table entry can map - * and depends on the number of levels in the page table. Compute the - * PGDIR_SHIFT for a given number of levels. - */ -#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls)) - /* * The hardware supports concatenation of up to 16 tables at stage2 entry * level and we use the feature whenever possible, which means we resolve 4 @@ -30,11 +23,6 @@ #define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4) #define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr) -/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */ -#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) -#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm)) -#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1) - /* * kvm_mmmu_cache_min_pages() is the number of pages required to install * a stage-2 translation. We pre-allocate the entry level page table at @@ -42,12 +30,4 @@ */ #define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) -static inline phys_addr_t -stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); - - return (boundary - 1 < end - 1) ? boundary : end; -} - #endif /* __ARM64_S2_PGTABLE_H_ */ diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9a13e487187..caf6cfeff35b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,6 +31,13 @@ static phys_addr_t hyp_idmap_vector; static unsigned long io_map_base; +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + phys_addr_t boundary = ALIGN_DOWN(addr + size, size); + + return (boundary - 1 < end - 1) ? boundary : end; +} /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, @@ -52,7 +59,7 @@ static int stage2_apply_range(struct kvm *kvm, phys_addr_t addr, if (!pgt) return -EINVAL; - next = stage2_pgd_addr_end(kvm, addr, end); + next = stage2_range_addr_end(addr, end); ret = fn(pgt, addr, next - addr); if (ret) break; From 837d632a383f13df7a67207a196d6eb4aeb4adca Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 4 Oct 2022 16:42:16 +0100 Subject: [PATCH 03/13] KVM: arm64: Enable stack protection and branch profiling for VHE For historical reasons, the VHE code inherited the build configuration from nVHE. Now those two parts have their own folder and makefile, we can enable stack protection and branch profiling for VHE. Signed-off-by: Vincent Donnefort Reviewed-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221004154216.2833636-1-vdonnefort@google.com --- arch/arm64/kvm/hyp/Makefile | 5 +---- arch/arm64/kvm/hyp/nvhe/Makefile | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile index 687598e41b21..a38dea6186c9 100644 --- a/arch/arm64/kvm/hyp/Makefile +++ b/arch/arm64/kvm/hyp/Makefile @@ -5,9 +5,6 @@ incdir := $(srctree)/$(src)/include subdir-asflags-y := -I$(incdir) -subdir-ccflags-y := -I$(incdir) \ - -fno-stack-protector \ - -DDISABLE_BRANCH_PROFILING \ - $(DISABLE_STACKLEAK_PLUGIN) +subdir-ccflags-y := -I$(incdir) obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index b5c5119c7396..48f6ae7cc6e6 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -10,6 +10,9 @@ asflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS # will explode instantly (Words of Marc Zyngier). So introduce a generic flag # __DISABLE_TRACE_MMIO__ to disable MMIO tracing for nVHE KVM. ccflags-y := -D__KVM_NVHE_HYPERVISOR__ -D__DISABLE_EXPORTS -D__DISABLE_TRACE_MMIO__ +ccflags-y += -fno-stack-protector \ + -DDISABLE_BRANCH_PROFILING \ + $(DISABLE_STACKLEAK_PLUGIN) hostprogs := gen-hyprel HOST_EXTRACFLAGS += -I$(objtree)/include From 8a6ffcbe26fd14d58075dcf3cbdf1b5b69b20402 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sun, 9 Oct 2022 11:31:31 +0800 Subject: [PATCH 04/13] KVM: arm64: selftests: Fix multiple versions of GIC creation Commit 98f94ce42ac6 ("KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to separate helper") wrongly converted a "real" GIC device creation to __kvm_test_create_device() and caused the test failure on my D05 (which supports v2 emulation). Fix it. Fixes: 98f94ce42ac6 ("KVM: selftests: Move KVM_CREATE_DEVICE_TEST code to separate helper") Signed-off-by: Zenghui Yu Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221009033131.365-1-yuzenghui@huawei.com --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index e05ecb31823f..9c131d977a1b 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -662,8 +662,8 @@ int test_kvm_device(uint32_t gic_dev_type) : KVM_DEV_TYPE_ARM_VGIC_V2; if (!__kvm_test_create_device(v.vm, other)) { - ret = __kvm_test_create_device(v.vm, other); - TEST_ASSERT(ret && (errno == EINVAL || errno == EEXIST), + ret = __kvm_create_device(v.vm, other); + TEST_ASSERT(ret < 0 && (errno == EINVAL || errno == EEXIST), "create GIC device while other version exists"); } From 05c2224d4b049406b0545a10be05280ff4b8ba0a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 13 Oct 2022 14:30:20 +0800 Subject: [PATCH 05/13] KVM: selftests: Fix number of pages for memory slot in memslot_modification_stress_test It's required by vm_userspace_mem_region_add() that memory size should be aligned to host page size. However, one guest page is provided by memslot_modification_stress_test. It triggers failure in the scenario of 64KB-page-size-host and 4KB-page-size-guest, as the following messages indicate. # ./memslot_modification_stress_test Testing guest mode: PA-bits:40, VA-bits:48, 4K pages guest physical test memory: [0xffbfff0000, 0xffffff0000) Finished creating vCPUs Started all vCPUs ==== Test Assertion Failure ==== lib/kvm_util.c:824: vm_adjust_num_guest_pages(vm->mode, npages) == npages pid=5712 tid=5712 errno=0 - Success 1 0x0000000000404eeb: vm_userspace_mem_region_add at kvm_util.c:822 2 0x0000000000401a5b: add_remove_memslot at memslot_modification_stress_test.c:82 3 (inlined by) run_test at memslot_modification_stress_test.c:110 4 0x0000000000402417: for_each_guest_mode at guest_modes.c:100 5 0x00000000004016a7: main at memslot_modification_stress_test.c:187 6 0x0000ffffb8cd4383: ?? ??:0 7 0x0000000000401827: _start at :? Number of guest pages is not compatible with the host. Try npages=16 Fix the issue by providing 16 guest pages to the memory slot for this particular combination of 64KB-page-size-host and 4KB-page-size-guest on aarch64. Fixes: ef4c9f4f65462 ("KVM: selftests: Fix 32-bit truncation of vm_get_max_gfn()") Signed-off-by: Gavin Shan Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221013063020.201856-1-gshan@redhat.com --- tools/testing/selftests/kvm/memslot_modification_stress_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 6ee7e1dde404..bb1d17a1171b 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -67,7 +67,7 @@ struct memslot_antagonist_args { static void add_remove_memslot(struct kvm_vm *vm, useconds_t delay, uint64_t nr_modifications) { - const uint64_t pages = 1; + uint64_t pages = max_t(int, vm->page_size, getpagesize()) / vm->page_size; uint64_t gpa; int i; From bde971a83bbff78561458ded236605a365411b87 Mon Sep 17 00:00:00 2001 From: Denis Nikitin Date: Fri, 14 Oct 2022 11:45:32 -0700 Subject: [PATCH 06/13] KVM: arm64: nvhe: Fix build with profile optimization Kernel build with clang and KCFLAGS=-fprofile-sample-use= fails with: error: arch/arm64/kvm/hyp/nvhe/kvm_nvhe.tmp.o: Unexpected SHT_REL section ".rel.llvm.call-graph-profile" Starting from 13.0.0 llvm can generate SHT_REL section, see https://reviews.llvm.org/rGca3bdb57fa1ac98b711a735de048c12b5fdd8086. gen-hyprel does not support SHT_REL relocation section. Filter out profile use flags to fix the build with profile optimization. Signed-off-by: Denis Nikitin Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20221014184532.3153551-1-denik@chromium.org --- arch/arm64/kvm/hyp/nvhe/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 48f6ae7cc6e6..be0a2bc3e20d 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -92,6 +92,10 @@ quiet_cmd_hypcopy = HYPCOPY $@ # Remove ftrace, Shadow Call Stack, and CFI CFLAGS. # This is equivalent to the 'notrace', '__noscs', and '__nocfi' annotations. KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI), $(KBUILD_CFLAGS)) +# Starting from 13.0.0 llvm emits SHT_REL section '.llvm.call-graph-profile' +# when profile optimization is applied. gen-hyprel does not support SHT_REL and +# causes a build failure. Remove profile optimization flags. +KBUILD_CFLAGS := $(filter-out -fprofile-sample-use=% -fprofile-use=%, $(KBUILD_CFLAGS)) # KVM nVHE code is run at a different exception code with a different map, so # compiler instrumentation that inserts callbacks or checks into the code may From c000a2607145d28b06c697f968491372ea56c23a Mon Sep 17 00:00:00 2001 From: Eric Ren Date: Sat, 15 Oct 2022 11:19:28 +0800 Subject: [PATCH 07/13] KVM: arm64: vgic: Fix exit condition in scan_its_table() With some PCIe topologies, restoring a guest fails while parsing the ITS device tables. Reproducer hints: 1. Create ARM virt VM with pxb-pcie bus which adds extra host bridges, with qemu command like: ``` -device pxb-pcie,bus_nr=8,id=pci.x,numa_node=0,bus=pcie.0 \ -device pcie-root-port,..,bus=pci.x \ ... -device pxb-pcie,bus_nr=37,id=pci.y,numa_node=1,bus=pcie.0 \ -device pcie-root-port,..,bus=pci.y \ ... ``` 2. Ensure the guest uses 2-level device table 3. Perform VM migration which calls save/restore device tables In that setup, we get a big "offset" between 2 device_ids, which makes unsigned "len" round up a big positive number, causing the scan loop to continue with a bad GPA. For example: 1. L1 table has 2 entries; 2. and we are now scanning at L2 table entry index 2075 (pointed to by L1 first entry) 3. if next device id is 9472, we will get a big offset: 7397; 4. with unsigned 'len', 'len -= offset * esz', len will underflow to a positive number, mistakenly into next iteration with a bad GPA; (It should break out of the current L2 table scanning, and jump into the next L1 table entry) 5. that bad GPA fails the guest read. Fix it by stopping the L2 table scan when the next device id is outside of the current table, allowing the scan to continue from the next L1 table entry. Thanks to Eric Auger for the fix suggestion. Fixes: 920a7a8fa92a ("KVM: arm64: vgic-its: Add infrastructure for tableookup") Suggested-by: Eric Auger Signed-off-by: Eric Ren [maz: commit message tidy-up] Signed-off-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/d9c3a564af9e2c5bf63f48a7dcbf08cd593c5c0b.1665802985.git.renzhengeek@gmail.com --- arch/arm64/kvm/vgic/vgic-its.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 24d7778d1ce6..733b53055f97 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2149,7 +2149,7 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, memset(entry, 0, esz); - while (len > 0) { + while (true) { int next_offset; size_t byte_offset; @@ -2162,6 +2162,9 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, return next_offset; byte_offset = next_offset * esz; + if (byte_offset >= len) + break; + id += next_offset; gpa += byte_offset; len -= byte_offset; From 5c20a3a9df19811051441214e7f5091cb3546db0 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 21 Oct 2022 11:52:39 +0530 Subject: [PATCH 08/13] RISC-V: Fix compilation without RISCV_ISA_ZICBOM riscv_cbom_block_size and riscv_init_cbom_blocksize() should always be available and riscv_init_cbom_blocksize() should always be invoked, even when compiling without RISCV_ISA_ZICBOM enabled. This is because disabling RISCV_ISA_ZICBOM means "don't use zicbom instructions in the kernel" not "pretend there isn't zicbom, even when there is". When zicbom is available, whether the kernel enables its use with RISCV_ISA_ZICBOM or not, KVM will offer it to guests. Ensure we can build KVM and that the block size is initialized even when compiling without RISCV_ISA_ZICBOM. Fixes: 8f7e001e0325 ("RISC-V: Clean up the Zicbom block size probing") Reported-by: kernel test robot Signed-off-by: Andrew Jones Signed-off-by: Anup Patel Reviewed-by: Conor Dooley Reviewed-by: Heiko Stuebner Tested-by: Heiko Stuebner Signed-off-by: Anup Patel --- arch/riscv/include/asm/cacheflush.h | 8 ------ arch/riscv/mm/cacheflush.c | 38 ++++++++++++++++++++++++++ arch/riscv/mm/dma-noncoherent.c | 41 ----------------------------- 3 files changed, 38 insertions(+), 49 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8a5c246b0a21..f6fbe7042f1c 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -42,16 +42,8 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ -/* - * The T-Head CMO errata internally probe the CBOM block size, but otherwise - * don't depend on Zicbom. - */ extern unsigned int riscv_cbom_block_size; -#ifdef CONFIG_RISCV_ISA_ZICBOM void riscv_init_cbom_blocksize(void); -#else -static inline void riscv_init_cbom_blocksize(void) { } -#endif #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 6cb7d96ad9c7..57b40a350420 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -3,6 +3,7 @@ * Copyright (C) 2017 SiFive */ +#include #include #ifdef CONFIG_SMP @@ -86,3 +87,40 @@ void flush_icache_pte(pte_t pte) flush_icache_all(); } #endif /* CONFIG_MMU */ + +unsigned int riscv_cbom_block_size; +EXPORT_SYMBOL_GPL(riscv_cbom_block_size); + +void riscv_init_cbom_blocksize(void) +{ + struct device_node *node; + unsigned long cbom_hartid; + u32 val, probed_block_size; + int ret; + + probed_block_size = 0; + for_each_of_cpu_node(node) { + unsigned long hartid; + + ret = riscv_of_processor_hartid(node, &hartid); + if (ret) + continue; + + /* set block-size for cbom extension if available */ + ret = of_property_read_u32(node, "riscv,cbom-block-size", &val); + if (ret) + continue; + + if (!probed_block_size) { + probed_block_size = val; + cbom_hartid = hartid; + } else { + if (probed_block_size != val) + pr_warn("cbom-block-size mismatched between harts %lu and %lu\n", + cbom_hartid, hartid); + } + } + + if (probed_block_size) + riscv_cbom_block_size = probed_block_size; +} diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index b0add983530a..d919efab6eba 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -8,13 +8,8 @@ #include #include #include -#include -#include #include -unsigned int riscv_cbom_block_size; -EXPORT_SYMBOL_GPL(riscv_cbom_block_size); - static bool noncoherent_supported; void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, @@ -77,42 +72,6 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, dev->dma_coherent = coherent; } -#ifdef CONFIG_RISCV_ISA_ZICBOM -void riscv_init_cbom_blocksize(void) -{ - struct device_node *node; - unsigned long cbom_hartid; - u32 val, probed_block_size; - int ret; - - probed_block_size = 0; - for_each_of_cpu_node(node) { - unsigned long hartid; - - ret = riscv_of_processor_hartid(node, &hartid); - if (ret) - continue; - - /* set block-size for cbom extension if available */ - ret = of_property_read_u32(node, "riscv,cbom-block-size", &val); - if (ret) - continue; - - if (!probed_block_size) { - probed_block_size = val; - cbom_hartid = hartid; - } else { - if (probed_block_size != val) - pr_warn("cbom-block-size mismatched between harts %lu and %lu\n", - cbom_hartid, hartid); - } - } - - if (probed_block_size) - riscv_cbom_block_size = probed_block_size; -} -#endif - void riscv_noncoherent_supported(void) { WARN(!riscv_cbom_block_size, From cea8896bd936135559253e9b23340cfa1cdf0caf Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Fri, 21 Oct 2022 11:52:45 +0530 Subject: [PATCH 09/13] RISC-V: KVM: Fix kvm_riscv_vcpu_timer_pending() for Sstc The kvm_riscv_vcpu_timer_pending() checks per-VCPU next_cycles and per-VCPU software injected VS timer interrupt. This function returns incorrect value when Sstc is available because the per-VCPU next_cycles are only updated by kvm_riscv_vcpu_timer_save() called from kvm_arch_vcpu_put(). As a result, when Sstc is available the VCPU does not block properly upon WFI traps. To fix the above issue, we introduce kvm_riscv_vcpu_timer_sync() which will update per-VCPU next_cycles upon every VM exit instead of kvm_riscv_vcpu_timer_save(). Fixes: 8f5cb44b1bae ("RISC-V: KVM: Support sstc extension") Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_timer.h | 1 + arch/riscv/kvm/vcpu.c | 3 +++ arch/riscv/kvm/vcpu_timer.c | 27 ++++++++++++++++++------- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_timer.h b/arch/riscv/include/asm/kvm_vcpu_timer.h index 0d8fdb8ec63a..82f7260301da 100644 --- a/arch/riscv/include/asm/kvm_vcpu_timer.h +++ b/arch/riscv/include/asm/kvm_vcpu_timer.h @@ -45,6 +45,7 @@ int kvm_riscv_vcpu_timer_deinit(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_timer_reset(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu); void kvm_riscv_guest_timer_init(struct kvm *kvm); +void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu); bool kvm_riscv_vcpu_timer_pending(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index a032c4f0d600..71ebbc4821f0 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -708,6 +708,9 @@ void kvm_riscv_vcpu_sync_interrupts(struct kvm_vcpu *vcpu) clear_bit(IRQ_VS_SOFT, &v->irqs_pending); } } + + /* Sync-up timer CSRs */ + kvm_riscv_vcpu_timer_sync(vcpu); } int kvm_riscv_vcpu_set_interrupt(struct kvm_vcpu *vcpu, unsigned int irq) diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 185f2386a747..ad34519c8a13 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -320,6 +320,21 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_timer_unblocking(vcpu); } +void kvm_riscv_vcpu_timer_sync(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_timer *t = &vcpu->arch.timer; + + if (!t->sstc_enabled) + return; + +#if defined(CONFIG_32BIT) + t->next_cycles = csr_read(CSR_VSTIMECMP); + t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32; +#else + t->next_cycles = csr_read(CSR_VSTIMECMP); +#endif +} + void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) { struct kvm_vcpu_timer *t = &vcpu->arch.timer; @@ -327,13 +342,11 @@ void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) if (!t->sstc_enabled) return; - t = &vcpu->arch.timer; -#if defined(CONFIG_32BIT) - t->next_cycles = csr_read(CSR_VSTIMECMP); - t->next_cycles |= (u64)csr_read(CSR_VSTIMECMPH) << 32; -#else - t->next_cycles = csr_read(CSR_VSTIMECMP); -#endif + /* + * The vstimecmp CSRs are saved by kvm_riscv_vcpu_timer_sync() + * upon every VM exit so no need to save here. + */ + /* timer should be enabled for the remaining operations */ if (unlikely(!t->init_done)) return; From ed51862f2f57cbce6fed2d4278cfe70a490899fd Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 17 Oct 2022 20:45:39 +0200 Subject: [PATCH 10/13] kvm: Add support for arch compat vm ioctls We will introduce the first architecture specific compat vm ioctl in the next patch. Add all necessary boilerplate to allow architectures to override compat vm ioctls when necessary. Signed-off-by: Alexander Graf Message-Id: <20221017184541.2658-2-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32f259fa5801..00c3448ba7f8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1390,6 +1390,8 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg); int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e30f1b4ecfa5..1376a47fedee 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4839,6 +4839,12 @@ struct compat_kvm_clear_dirty_log { }; }; +long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOTTY; +} + static long kvm_vm_compat_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4847,6 +4853,11 @@ static long kvm_vm_compat_ioctl(struct file *filp, if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; + + r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg); + if (r != -ENOTTY) + return r; + switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT case KVM_CLEAR_DIRTY_LOG: { From 2e3272bc1790825c43d2c39690bf2836b81c6d36 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 17 Oct 2022 20:45:40 +0200 Subject: [PATCH 11/13] KVM: x86: Copy filter arg outside kvm_vm_ioctl_set_msr_filter() In the next patch we want to introduce a second caller to set_msr_filter() which constructs its own filter list on the stack. Refactor the original function so it takes it as argument instead of reading it through copy_from_user(). Signed-off-by: Alexander Graf Message-Id: <20221017184541.2658-3-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4bd5f8a751de..78f779f0264b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6442,26 +6442,22 @@ static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, return 0; } -static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) { - struct kvm_msr_filter __user *user_msr_filter = argp; struct kvm_x86_msr_filter *new_filter, *old_filter; - struct kvm_msr_filter filter; bool default_allow; bool empty = true; int r = 0; u32 i; - if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) - return -EFAULT; - - if (filter.flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) return -EINVAL; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) - empty &= !filter.ranges[i].nmsrs; + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; - default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY); + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); if (empty && !default_allow) return -EINVAL; @@ -6469,8 +6465,8 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp) if (!new_filter) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { - r = kvm_add_msr_filter(new_filter, &filter.ranges[i]); + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); if (r) { kvm_free_msr_filter(new_filter); return r; @@ -6915,9 +6911,16 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_SET_PMU_EVENT_FILTER: r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); break; - case KVM_X86_SET_MSR_FILTER: - r = kvm_vm_ioctl_set_msr_filter(kvm, argp); + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); break; + } default: r = -ENOTTY; } From 1739c7017fb1d759965dcbab925ff5980a5318cb Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 17 Oct 2022 20:45:41 +0200 Subject: [PATCH 12/13] KVM: x86: Add compat handler for KVM_X86_SET_MSR_FILTER The KVM_X86_SET_MSR_FILTER ioctls contains a pointer in the passed in struct which means it has a different struct size depending on whether it gets called from 32bit or 64bit code. This patch introduces compat code that converts from the 32bit struct to its 64bit counterpart which then gets used going forward internally. With this applied, 32bit QEMU can successfully set MSR bitmaps when running on 64bit kernels. Reported-by: Andrew Randrianasulu Fixes: 1a155254ff937 ("KVM: x86: Introduce MSR filtering") Signed-off-by: Alexander Graf Message-Id: <20221017184541.2658-4-graf@amazon.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 78f779f0264b..9cf1ba865562 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6489,6 +6489,62 @@ static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, return 0; } +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif + #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_arch_suspend_notifier(struct kvm *kvm) { From 9aec606c1609a5da177b579475a73f6c948e034a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 22 Oct 2022 07:43:52 -0400 Subject: [PATCH 13/13] tools: include: sync include/api/linux/kvm.h Provide a definition of KVM_CAP_DIRTY_LOG_RING_ACQ_REL. Fixes: 17601bfed909 ("KVM: Add KVM_CAP_DIRTY_LOG_RING_ACQ_REL capability and config option") Cc: Marc Zyngier Signed-off-by: Paolo Bonzini --- tools/include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index eed0315a77a6..0d5d4419139a 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1177,6 +1177,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_DISABLE_NX_HUGE_PAGES 220 #define KVM_CAP_S390_ZPCI_OP 221 #define KVM_CAP_S390_CPU_TOPOLOGY 222 +#define KVM_CAP_DIRTY_LOG_RING_ACQ_REL 223 #ifdef KVM_CAP_IRQ_ROUTING