/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2012,2013 - ARM Ltd * Author: Marc Zyngier * * Derived from arch/arm/include/asm/kvm_host.h: * Copyright (C) 2012 - Virtual Open Systems and Columbia University * Author: Christoffer Dall */ #ifndef __ARM64_KVM_HOST_H__ #define __ARM64_KVM_HOST_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_HALT_POLL_NS_DEFAULT 500000 #include #include #include #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS #define KVM_VCPU_MAX_FEATURES 7 #define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) #define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) #define KVM_HAVE_MMU_RWLOCK /* * Mode of operation configurable with kvm-arm.mode early param. * See Documentation/admin-guide/kernel-parameters.txt for more information. */ enum kvm_mode { KVM_MODE_DEFAULT, KVM_MODE_PROTECTED, KVM_MODE_NV, KVM_MODE_NONE, }; #ifdef CONFIG_KVM enum kvm_mode kvm_get_mode(void); #else static inline enum kvm_mode kvm_get_mode(void) { return KVM_MODE_NONE; }; #endif DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int __ro_after_init kvm_sve_max_vl; int __init kvm_arm_init_sve(void); u32 __attribute_const__ kvm_target_cpu(void); void kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); struct kvm_hyp_memcache { phys_addr_t head; unsigned long nr_pages; }; static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, phys_addr_t *p, phys_addr_t (*to_pa)(void *virt)) { *p = mc->head; mc->head = to_pa(p); mc->nr_pages++; } static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc, void *(*to_va)(phys_addr_t phys)) { phys_addr_t *p = to_va(mc->head); if (!mc->nr_pages) return NULL; mc->head = *p; mc->nr_pages--; return p; } static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages, void *(*alloc_fn)(void *arg), phys_addr_t (*to_pa)(void *virt), void *arg) { while (mc->nr_pages < min_pages) { phys_addr_t *p = alloc_fn(arg); if (!p) return -ENOMEM; push_hyp_memcache(mc, p, to_pa); } return 0; } static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc, void (*free_fn)(void *virt, void *arg), void *(*to_va)(phys_addr_t phys), void *arg) { while (mc->nr_pages) free_fn(pop_hyp_memcache(mc, to_va), arg); } void free_hyp_memcache(struct kvm_hyp_memcache *mc); int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages); struct kvm_vmid { atomic64_t id; }; struct kvm_s2_mmu { struct kvm_vmid vmid; /* * stage2 entry level table * * Two kvm_s2_mmu structures in the same VM can point to the same * pgd here. This happens when running a guest using a * translation regime that isn't affected by its own stage-2 * translation, such as a non-VHE hypervisor running at vEL2, or * for vEL1/EL0 with vHCR_EL2.VM == 0. In that case, we use the * canonical stage-2 page tables. */ phys_addr_t pgd_phys; struct kvm_pgtable *pgt; /* * VTCR value used on the host. For a non-NV guest (or a NV * guest that runs in a context where its own S2 doesn't * apply), its T0SZ value reflects that of the IPA size. * * For a shadow S2 MMU, T0SZ reflects the PARange exposed to * the guest. */ u64 vtcr; /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; #define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 /* * Memory cache used to split * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It * is used to allocate stage2 page tables while splitting huge * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE * influences both the capacity of the split page cache, and * how often KVM reschedules. Be wary of raising CHUNK_SIZE * too high. * * Protected by kvm->slots_lock. */ struct kvm_mmu_memory_cache split_page_cache; uint64_t split_page_chunk_size; struct kvm_arch *arch; }; struct kvm_arch_memory_slot { }; /** * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests * * @std_bmap: Bitmap of standard secure service calls * @std_hyp_bmap: Bitmap of standard hypervisor service calls * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls */ struct kvm_smccc_features { unsigned long std_bmap; unsigned long std_hyp_bmap; unsigned long vendor_hyp_bmap; }; typedef unsigned int pkvm_handle_t; struct kvm_protected_vm { pkvm_handle_t handle; struct kvm_hyp_memcache teardown_mc; }; struct kvm_mpidr_data { u64 mpidr_mask; DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx); }; static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr) { unsigned long mask = data->mpidr_mask; u64 aff = mpidr & MPIDR_HWID_BITMASK; int nbits, bit, bit_idx = 0; u16 index = 0; /* * If this looks like RISC-V's BEXT or x86's PEXT * instructions, it isn't by accident. */ nbits = fls(mask); for_each_set_bit(bit, &mask, nbits) { index |= (aff & BIT(bit)) >> (bit - bit_idx); bit_idx++; } return index; } struct kvm_arch { struct kvm_s2_mmu mmu; /* Interrupt controller */ struct vgic_dist vgic; /* Timers */ struct arch_timer_vm_data timer_data; /* Mandated version of PSCI */ u32 psci_version; /* Protects VM-scoped configuration data */ struct mutex config_lock; /* * If we encounter a data abort without valid instruction syndrome * information, report this to user space. User space can (and * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is * supported. */ #define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER 0 /* Memory Tagging Extension enabled for the guest */ #define KVM_ARCH_FLAG_MTE_ENABLED 1 /* At least one vCPU has ran in the VM */ #define KVM_ARCH_FLAG_HAS_RAN_ONCE 2 /* The vCPU feature set for the VM is configured */ #define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3 /* PSCI SYSTEM_SUSPEND enabled for the guest */ #define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4 /* VM counter offset */ #define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5 /* Timer PPIs made immutable */ #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 /* Initial ID reg values loaded */ #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 7 unsigned long flags; /* VM-wide vCPU feature set */ DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES); /* MPIDR to vcpu index mapping, optional */ struct kvm_mpidr_data *mpidr_data; /* * VM-wide PMU filter, implemented as a bitmap and big enough for * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). */ unsigned long *pmu_filter; struct arm_pmu *arm_pmu; cpumask_var_t supported_cpus; /* PMCR_EL0.N value for the guest */ u8 pmcr_n; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; /* * Emulated CPU ID registers per VM * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. * * These emulated idregs are VM-wide, but accessed from the context of a vCPU. * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. */ #define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) #define IDX_IDREG(idx) sys_reg(3, 0, 0, ((idx) >> 3) + 1, (idx) & Op2_mask) #define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)]) #define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) u64 id_regs[KVM_ARM_ID_REG_NUM]; /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. */ struct kvm_protected_vm pkvm; }; struct kvm_vcpu_fault_info { u64 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ u64 hpfar_el2; /* Hyp IPA Fault Address Register */ u64 disr_el1; /* Deferred [SError] Status Register */ }; /* * VNCR() just places the VNCR_capable registers in the enum after * __VNCR_START__, and the value (after correction) to be an 8-byte offset * from the VNCR base. As we don't require the enum to be otherwise ordered, * we need the terrible hack below to ensure that we correctly size the * sys_regs array, no matter what. * * The __MAX__ macro has been lifted from Sean Eron Anderson's wonderful * treasure trove of bit hacks: * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax */ #define __MAX__(x,y) ((x) ^ (((x) ^ (y)) & -((x) < (y)))) #define VNCR(r) \ __before_##r, \ r = __VNCR_START__ + ((VNCR_ ## r) / 8), \ __after_##r = __MAX__(__before_##r - 1, r) enum vcpu_sysreg { __INVALID_SYSREG__, /* 0 is reserved as an invalid value */ MPIDR_EL1, /* MultiProcessor Affinity Register */ CLIDR_EL1, /* Cache Level ID Register */ CSSELR_EL1, /* Cache Size Selection Register */ TPIDR_EL0, /* Thread ID, User R/W */ TPIDRRO_EL0, /* Thread ID, User R/O */ TPIDR_EL1, /* Thread ID, Privileged */ CNTKCTL_EL1, /* Timer Control Register (EL1) */ PAR_EL1, /* Physical Address Register */ MDCCINT_EL1, /* Monitor Debug Comms Channel Interrupt Enable Reg */ OSLSR_EL1, /* OS Lock Status Register */ DISR_EL1, /* Deferred Interrupt Status Register */ /* Performance Monitors Registers */ PMCR_EL0, /* Control Register */ PMSELR_EL0, /* Event Counter Selection Register */ PMEVCNTR0_EL0, /* Event Counter Register (0-30) */ PMEVCNTR30_EL0 = PMEVCNTR0_EL0 + 30, PMCCNTR_EL0, /* Cycle Counter Register */ PMEVTYPER0_EL0, /* Event Type Register (0-30) */ PMEVTYPER30_EL0 = PMEVTYPER0_EL0 + 30, PMCCFILTR_EL0, /* Cycle Count Filter Register */ PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ APIAKEYLO_EL1, APIAKEYHI_EL1, APIBKEYLO_EL1, APIBKEYHI_EL1, APDAKEYLO_EL1, APDAKEYHI_EL1, APDBKEYLO_EL1, APDBKEYHI_EL1, APGAKEYLO_EL1, APGAKEYHI_EL1, /* Memory Tagging Extension registers */ RGSR_EL1, /* Random Allocation Tag Seed Register */ GCR_EL1, /* Tag Control Register */ TFSRE0_EL1, /* Tag Fault Status Register (EL0) */ /* 32bit specific registers. */ DACR32_EL2, /* Domain Access Control Register */ IFSR32_EL2, /* Instruction Fault Status Register */ FPEXC32_EL2, /* Floating-Point Exception Control Register */ DBGVCR32_EL2, /* Debug Vector Catch Register */ /* EL2 registers */ SCTLR_EL2, /* System Control Register (EL2) */ ACTLR_EL2, /* Auxiliary Control Register (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ CPTR_EL2, /* Architectural Feature Trap Register (EL2) */ HACR_EL2, /* Hypervisor Auxiliary Control Register */ TTBR0_EL2, /* Translation Table Base Register 0 (EL2) */ TTBR1_EL2, /* Translation Table Base Register 1 (EL2) */ TCR_EL2, /* Translation Control Register (EL2) */ SPSR_EL2, /* EL2 saved program status register */ ELR_EL2, /* EL2 exception link register */ AFSR0_EL2, /* Auxiliary Fault Status Register 0 (EL2) */ AFSR1_EL2, /* Auxiliary Fault Status Register 1 (EL2) */ ESR_EL2, /* Exception Syndrome Register (EL2) */ FAR_EL2, /* Fault Address Register (EL2) */ HPFAR_EL2, /* Hypervisor IPA Fault Address Register */ MAIR_EL2, /* Memory Attribute Indirection Register (EL2) */ AMAIR_EL2, /* Auxiliary Memory Attribute Indirection Register (EL2) */ VBAR_EL2, /* Vector Base Address Register (EL2) */ RVBAR_EL2, /* Reset Vector Base Address Register */ CONTEXTIDR_EL2, /* Context ID Register (EL2) */ CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ SP_EL2, /* EL2 Stack Pointer */ CNTHP_CTL_EL2, CNTHP_CVAL_EL2, CNTHV_CTL_EL2, CNTHV_CVAL_EL2, __VNCR_START__, /* Any VNCR-capable reg goes after this point */ VNCR(SCTLR_EL1),/* System Control Register */ VNCR(ACTLR_EL1),/* Auxiliary Control Register */ VNCR(CPACR_EL1),/* Coprocessor Access Control */ VNCR(ZCR_EL1), /* SVE Control */ VNCR(TTBR0_EL1),/* Translation Table Base Register 0 */ VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */ VNCR(TCR_EL1), /* Translation Control Register */ VNCR(TCR2_EL1), /* Extended Translation Control Register */ VNCR(ESR_EL1), /* Exception Syndrome Register */ VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */ VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */ VNCR(FAR_EL1), /* Fault Address Register */ VNCR(MAIR_EL1), /* Memory Attribute Indirection Register */ VNCR(VBAR_EL1), /* Vector Base Address Register */ VNCR(CONTEXTIDR_EL1), /* Context ID Register */ VNCR(AMAIR_EL1),/* Aux Memory Attribute Indirection Register */ VNCR(MDSCR_EL1),/* Monitor Debug System Control Register */ VNCR(ELR_EL1), VNCR(SP_EL1), VNCR(SPSR_EL1), VNCR(TFSR_EL1), /* Tag Fault Status Register (EL1) */ VNCR(VPIDR_EL2),/* Virtualization Processor ID Register */ VNCR(VMPIDR_EL2),/* Virtualization Multiprocessor ID Register */ VNCR(HCR_EL2), /* Hypervisor Configuration Register */ VNCR(HSTR_EL2), /* Hypervisor System Trap Register */ VNCR(VTTBR_EL2),/* Virtualization Translation Table Base Register */ VNCR(VTCR_EL2), /* Virtualization Translation Control Register */ VNCR(TPIDR_EL2),/* EL2 Software Thread ID Register */ VNCR(HCRX_EL2), /* Extended Hypervisor Configuration Register */ /* Permission Indirection Extension registers */ VNCR(PIR_EL1), /* Permission Indirection Register 1 (EL1) */ VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */ VNCR(HFGRTR_EL2), VNCR(HFGWTR_EL2), VNCR(HFGITR_EL2), VNCR(HDFGRTR_EL2), VNCR(HDFGWTR_EL2), VNCR(HAFGRTR_EL2), VNCR(CNTVOFF_EL2), VNCR(CNTV_CVAL_EL0), VNCR(CNTV_CTL_EL0), VNCR(CNTP_CVAL_EL0), VNCR(CNTP_CTL_EL0), NR_SYS_REGS /* Nothing after this line! */ }; struct kvm_cpu_context { struct user_pt_regs regs; /* sp = sp_el0 */ u64 spsr_abt; u64 spsr_und; u64 spsr_irq; u64 spsr_fiq; struct user_fpsimd_state fp_regs; u64 sys_regs[NR_SYS_REGS]; struct kvm_vcpu *__hyp_running_vcpu; /* This pointer has to be 4kB aligned. */ u64 *vncr_array; }; struct kvm_host_data { struct kvm_cpu_context host_ctxt; }; struct kvm_host_psci_config { /* PSCI version used by host. */ u32 version; u32 smccc_version; /* Function IDs used by host if version is v0.1. */ struct psci_0_1_function_ids function_ids_0_1; bool psci_0_1_cpu_suspend_implemented; bool psci_0_1_cpu_on_implemented; bool psci_0_1_cpu_off_implemented; bool psci_0_1_migrate_implemented; }; extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config); #define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config) extern s64 kvm_nvhe_sym(hyp_physvirt_offset); #define hyp_physvirt_offset CHOOSE_NVHE_SYM(hyp_physvirt_offset) extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS]; #define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map) struct vcpu_reset_state { unsigned long pc; unsigned long r0; bool be; bool reset; }; struct kvm_vcpu_arch { struct kvm_cpu_context ctxt; /* * Guest floating point state * * The architecture has two main floating point extensions, * the original FPSIMD and SVE. These have overlapping * register views, with the FPSIMD V registers occupying the * low 128 bits of the SVE Z registers. When the core * floating point code saves the register state of a task it * records which view it saved in fp_type. */ void *sve_state; enum fp_type fp_type; unsigned int sve_max_vl; u64 svcr; u64 fpmr; /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; /* Values of trap registers for the guest. */ u64 hcr_el2; u64 mdcr_el2; u64 cptr_el2; /* Values of trap registers for the host before guest entry. */ u64 mdcr_el2_host; /* Exception Information */ struct kvm_vcpu_fault_info fault; /* Ownership of the FP regs */ enum { FP_STATE_FREE, FP_STATE_HOST_OWNED, FP_STATE_GUEST_OWNED, } fp_state; /* Configuration flags, set once and for all before the vcpu can run */ u8 cflags; /* Input flags to the hypervisor code, potentially cleared after use */ u8 iflags; /* State flags for kernel bookkeeping, unused by the hypervisor code */ u8 sflags; /* * Don't run the guest (internal implementation need). * * Contrary to the flags above, this is set/cleared outside of * a vcpu context, and thus cannot be mixed with the flags * themselves (or the flag accesses need to be made atomic). */ bool pause; /* * We maintain more than a single set of debug registers to support * debugging the guest from the host and to maintain separate host and * guest state during world switches. vcpu_debug_state are the debug * registers of the vcpu as the guest sees them. host_debug_state are * the host registers which are saved and restored during * world switches. external_debug_state contains the debug * values we want to debug the guest. This is set via the * KVM_SET_GUEST_DEBUG ioctl. * * debug_ptr points to the set of debug registers that should be loaded * onto the hardware when running the guest. */ struct kvm_guest_debug_arch *debug_ptr; struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; struct user_fpsimd_state *host_fpsimd_state; /* hyp VA */ struct task_struct *parent_task; struct { /* {Break,watch}point registers */ struct kvm_guest_debug_arch regs; /* Statistical profiling extension */ u64 pmscr_el1; /* Self-hosted trace */ u64 trfcr_el1; } host_debug_state; /* VGIC state */ struct vgic_cpu vgic_cpu; struct arch_timer_cpu timer_cpu; struct kvm_pmu pmu; /* * Guest registers we preserve during guest debugging. * * These shadow registers are updated by the kvm_handle_sys_reg * trap handler if the guest accesses or updates them while we * are using guest debug. */ struct { u32 mdscr_el1; bool pstate_ss; } guest_debug_preserved; /* vcpu power state */ struct kvm_mp_state mp_state; spinlock_t mp_state_lock; /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ u64 vsesr_el2; /* Additional reset state */ struct vcpu_reset_state reset_state; /* Guest PV state */ struct { u64 last_steal; gpa_t base; } steal; /* Per-vcpu CCSIDR override or NULL */ u32 *ccsidr; }; /* * Each 'flag' is composed of a comma-separated triplet: * * - the flag-set it belongs to in the vcpu->arch structure * - the value for that flag * - the mask for that flag * * __vcpu_single_flag() builds such a triplet for a single-bit flag. * unpack_vcpu_flag() extract the flag value from the triplet for * direct use outside of the flag accessors. */ #define __vcpu_single_flag(_set, _f) _set, (_f), (_f) #define __unpack_flag(_set, _f, _m) _f #define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) #define __build_check_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *_fset; \ \ /* Check that the flags fit in the mask */ \ BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \ /* Check that the flags fit in the type */ \ BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \ } while (0) #define __vcpu_get_flag(v, flagset, f, m) \ ({ \ __build_check_flag(v, flagset, f, m); \ \ READ_ONCE(v->arch.flagset) & (m); \ }) /* * Note that the set/clear accessors must be preempt-safe in order to * avoid nesting them with load/put which also manipulate flags... */ #ifdef __KVM_NVHE_HYPERVISOR__ /* the nVHE hypervisor is always non-preemptible */ #define __vcpu_flags_preempt_disable() #define __vcpu_flags_preempt_enable() #else #define __vcpu_flags_preempt_disable() preempt_disable() #define __vcpu_flags_preempt_enable() preempt_enable() #endif #define __vcpu_set_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *fset; \ \ __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ __vcpu_flags_preempt_disable(); \ if (HWEIGHT(m) > 1) \ *fset &= ~(m); \ *fset |= (f); \ __vcpu_flags_preempt_enable(); \ } while (0) #define __vcpu_clear_flag(v, flagset, f, m) \ do { \ typeof(v->arch.flagset) *fset; \ \ __build_check_flag(v, flagset, f, m); \ \ fset = &v->arch.flagset; \ __vcpu_flags_preempt_disable(); \ *fset &= ~(m); \ __vcpu_flags_preempt_enable(); \ } while (0) #define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) #define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) #define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) /* SVE exposed to guest */ #define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) /* SVE config completed */ #define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) /* PTRAUTH exposed to guest */ #define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) /* KVM_ARM_VCPU_INIT completed */ #define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(3)) /* Exception pending */ #define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) /* * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't * be set together with an exception... */ #define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) /* Target EL/MODE (not a single flag, but let's abuse the macro) */ #define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) /* Helpers to encode exceptions with minimum fuss */ #define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) #define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) #define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL /* * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following * values: * * For AArch32 EL1: */ #define EXCEPT_AA32_UND __vcpu_except_flags(0) #define EXCEPT_AA32_IABT __vcpu_except_flags(1) #define EXCEPT_AA32_DABT __vcpu_except_flags(2) /* For AArch64: */ #define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) #define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) #define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) #define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) /* For AArch64 with NV: */ #define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) #define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) #define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) #define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) /* Guest debug is live */ #define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) /* Save SPE context if active */ #define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) /* Save TRBE context if active */ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) /* vcpu running in HYP context */ #define VCPU_HYP_CONTEXT __vcpu_single_flag(iflags, BIT(7)) /* SVE enabled for host EL0 */ #define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) /* SME enabled for EL0 */ #define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) /* Physical CPU not in supported_cpus */ #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) /* WFIT instruction trapped */ #define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) /* vcpu system registers loaded on physical CPU */ #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* Software step state is Active-pending */ #define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) /* PMUSERENR for the guest EL0 is on physical CPU */ #define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) /* WFI instruction trapped */ #define IN_WFI __vcpu_single_flag(sflags, BIT(7)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) #define vcpu_sve_max_vq(vcpu) sve_vq_from_vl((vcpu)->arch.sve_max_vl) #define vcpu_sve_state_size(vcpu) ({ \ size_t __size_ret; \ unsigned int __vcpu_vq; \ \ if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) { \ __size_ret = 0; \ } else { \ __vcpu_vq = vcpu_sve_max_vq(vcpu); \ __size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq); \ } \ \ __size_ret; \ }) #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) #define vcpu_has_sve(vcpu) (system_supports_sve() && \ vcpu_get_flag(vcpu, GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) #else #define vcpu_has_ptrauth(vcpu) false #endif #define vcpu_on_unsupported_cpu(vcpu) \ vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_set_on_unsupported_cpu(vcpu) \ vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_clear_on_unsupported_cpu(vcpu) \ vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) /* * Only use __vcpu_sys_reg/ctxt_sys_reg if you know you want the * memory backed version of a register, and not the one most recently * accessed by a running VCPU. For example, for userspace access or * for system registers that are never context switched, but only * emulated. * * Don't bother with VNCR-based accesses in the nVHE code, it has no * business dealing with NV. */ static inline u64 *__ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) { #if !defined (__KVM_NVHE_HYPERVISOR__) if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) && r >= __VNCR_START__ && ctxt->vncr_array)) return &ctxt->vncr_array[r - __VNCR_START__]; #endif return (u64 *)&ctxt->sys_regs[r]; } #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) #define __vcpu_sys_reg(v,r) (ctxt_sys_reg(&(v)->arch.ctxt, (r))) u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val) { /* * *** VHE ONLY *** * * System registers listed in the switch are not saved on every * exit from the guest but are only saved on vcpu_put. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but * should never be listed below, because the guest cannot modify its * own MPIDR_EL1 and MPIDR_EL1 is accessed for VCPU A from VCPU B's * thread when emulating cross-VCPU communication. */ if (!has_vhe()) return false; switch (reg) { case SCTLR_EL1: *val = read_sysreg_s(SYS_SCTLR_EL12); break; case CPACR_EL1: *val = read_sysreg_s(SYS_CPACR_EL12); break; case TTBR0_EL1: *val = read_sysreg_s(SYS_TTBR0_EL12); break; case TTBR1_EL1: *val = read_sysreg_s(SYS_TTBR1_EL12); break; case TCR_EL1: *val = read_sysreg_s(SYS_TCR_EL12); break; case ESR_EL1: *val = read_sysreg_s(SYS_ESR_EL12); break; case AFSR0_EL1: *val = read_sysreg_s(SYS_AFSR0_EL12); break; case AFSR1_EL1: *val = read_sysreg_s(SYS_AFSR1_EL12); break; case FAR_EL1: *val = read_sysreg_s(SYS_FAR_EL12); break; case MAIR_EL1: *val = read_sysreg_s(SYS_MAIR_EL12); break; case VBAR_EL1: *val = read_sysreg_s(SYS_VBAR_EL12); break; case CONTEXTIDR_EL1: *val = read_sysreg_s(SYS_CONTEXTIDR_EL12);break; case TPIDR_EL0: *val = read_sysreg_s(SYS_TPIDR_EL0); break; case TPIDRRO_EL0: *val = read_sysreg_s(SYS_TPIDRRO_EL0); break; case TPIDR_EL1: *val = read_sysreg_s(SYS_TPIDR_EL1); break; case AMAIR_EL1: *val = read_sysreg_s(SYS_AMAIR_EL12); break; case CNTKCTL_EL1: *val = read_sysreg_s(SYS_CNTKCTL_EL12); break; case ELR_EL1: *val = read_sysreg_s(SYS_ELR_EL12); break; case SPSR_EL1: *val = read_sysreg_s(SYS_SPSR_EL12); break; case PAR_EL1: *val = read_sysreg_par(); break; case DACR32_EL2: *val = read_sysreg_s(SYS_DACR32_EL2); break; case IFSR32_EL2: *val = read_sysreg_s(SYS_IFSR32_EL2); break; case DBGVCR32_EL2: *val = read_sysreg_s(SYS_DBGVCR32_EL2); break; default: return false; } return true; } static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg) { /* * *** VHE ONLY *** * * System registers listed in the switch are not restored on every * entry to the guest but are only restored on vcpu_load. * * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but * should never be listed below, because the MPIDR should only be set * once, before running the VCPU, and never changed later. */ if (!has_vhe()) return false; switch (reg) { case SCTLR_EL1: write_sysreg_s(val, SYS_SCTLR_EL12); break; case CPACR_EL1: write_sysreg_s(val, SYS_CPACR_EL12); break; case TTBR0_EL1: write_sysreg_s(val, SYS_TTBR0_EL12); break; case TTBR1_EL1: write_sysreg_s(val, SYS_TTBR1_EL12); break; case TCR_EL1: write_sysreg_s(val, SYS_TCR_EL12); break; case ESR_EL1: write_sysreg_s(val, SYS_ESR_EL12); break; case AFSR0_EL1: write_sysreg_s(val, SYS_AFSR0_EL12); break; case AFSR1_EL1: write_sysreg_s(val, SYS_AFSR1_EL12); break; case FAR_EL1: write_sysreg_s(val, SYS_FAR_EL12); break; case MAIR_EL1: write_sysreg_s(val, SYS_MAIR_EL12); break; case VBAR_EL1: write_sysreg_s(val, SYS_VBAR_EL12); break; case CONTEXTIDR_EL1: write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break; case TPIDR_EL0: write_sysreg_s(val, SYS_TPIDR_EL0); break; case TPIDRRO_EL0: write_sysreg_s(val, SYS_TPIDRRO_EL0); break; case TPIDR_EL1: write_sysreg_s(val, SYS_TPIDR_EL1); break; case AMAIR_EL1: write_sysreg_s(val, SYS_AMAIR_EL12); break; case CNTKCTL_EL1: write_sysreg_s(val, SYS_CNTKCTL_EL12); break; case ELR_EL1: write_sysreg_s(val, SYS_ELR_EL12); break; case SPSR_EL1: write_sysreg_s(val, SYS_SPSR_EL12); break; case PAR_EL1: write_sysreg_s(val, SYS_PAR_EL1); break; case DACR32_EL2: write_sysreg_s(val, SYS_DACR32_EL2); break; case IFSR32_EL2: write_sysreg_s(val, SYS_IFSR32_EL2); break; case DBGVCR32_EL2: write_sysreg_s(val, SYS_DBGVCR32_EL2); break; default: return false; } return true; } struct kvm_vm_stat { struct kvm_vm_stat_generic generic; }; struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; u64 signal_exits; u64 exits; }; unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); #define vcpu_has_run_once(vcpu) !!rcu_access_pointer((vcpu)->pid) #ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ ({ \ struct arm_smccc_res res; \ \ arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f), \ ##__VA_ARGS__, &res); \ WARN_ON(res.a0 != SMCCC_RET_SUCCESS); \ \ res.a1; \ }) /* * The couple of isb() below are there to guarantee the same behaviour * on VHE as on !VHE, where the eret to EL1 acts as a context * synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ if (has_vhe()) { \ f(__VA_ARGS__); \ isb(); \ } else { \ kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ } while(0) #define kvm_call_hyp_ret(f, ...) \ ({ \ typeof(f(__VA_ARGS__)) ret; \ \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ isb(); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ \ ret; \ }) #else /* __KVM_NVHE_HYPERVISOR__ */ #define kvm_call_hyp(f, ...) f(__VA_ARGS__) #define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__) #define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__) #endif /* __KVM_NVHE_HYPERVISOR__ */ int handle_exit(struct kvm_vcpu *vcpu, int exception_index); void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); int __init kvm_sys_reg_table_init(void); int __init populate_nv_trap_config(void); bool lock_all_vcpus(struct kvm *kvm); void unlock_all_vcpus(struct kvm *kvm); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); /* * Returns true if a Performance Monitoring Interrupt (PMI), a.k.a. perf event, * arrived in guest context. For arm64, any event that arrives while a vCPU is * loaded is considered to be "in guest". */ static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu) { return IS_ENABLED(CONFIG_GUEST_PERF_EVENTS) && !!vcpu; } long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu); gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu); void kvm_update_stolen_time(struct kvm_vcpu *vcpu); bool kvm_arm_pvtime_supported(void); int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); extern unsigned int __ro_after_init kvm_arm_vmid_bits; int __init kvm_arm_vmid_alloc_init(void); void __init kvm_arm_vmid_alloc_free(void); bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) { vcpu_arch->steal.base = INVALID_GPA; } static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch) { return (vcpu_arch->steal.base != INVALID_GPA); } void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome); struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) { /* The host's MPIDR is immutable, so let's set it up at boot time */ ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr(); } static inline bool kvm_system_needs_idmapped_vectors(void) { return cpus_have_final_cap(ARM64_SPECTRE_V3A); } static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} void kvm_arm_init_debug(void); void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK)) int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, struct kvm_arm_copy_mte_tags *copy_tags); int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm, struct kvm_arm_counter_offset *offset); int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range); /* Guest/host FPSIMD coordination helpers */ int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu); void kvm_vcpu_unshare_task_fp(struct kvm_vcpu *vcpu); static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) { return (!has_vhe() && attr->exclude_host); } /* Flags for host debug state */ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u32 clr); bool kvm_set_pmuserenr(u64 val); #else static inline void kvm_set_pmu_events(u32 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u32 clr) {} static inline bool kvm_set_pmuserenr(u64 val) { return false; } #endif void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu); void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu); int __init kvm_set_ipa_limit(void); #define __KVM_HAVE_ARCH_VM_ALLOC struct kvm *kvm_arch_alloc_vm(void); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline bool kvm_vm_is_protected(struct kvm *kvm) { return false; } int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); #define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) #define kvm_has_mte(kvm) \ (system_supports_mte() && \ test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags)) #define kvm_supports_32bit_el0() \ (system_supports_32bit_el0() && \ !static_branch_unlikely(&arm64_mismatched_32bit_el0)) #define kvm_vm_has_ran_once(kvm) \ (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags)) static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) { return test_bit(feature, ka->vcpu_features); } #define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) int kvm_trng_call(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM extern phys_addr_t hyp_mem_base; extern phys_addr_t hyp_mem_size; void __init kvm_hyp_reserve(void); #else static inline void kvm_hyp_reserve(void) { } #endif void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu); bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu); #endif /* __ARM64_KVM_HOST_H__ */