From e9ada6c208c15c907afe5afb1aa82e23e81eb8ba Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:13 +0100 Subject: [PATCH 01/69] KVM: arm64: Drop FP_FOREIGN_STATE from the hypervisor code The vcpu KVM_ARM64_FP_FOREIGN_FPSTATE flag tracks the thread's own TIF_FOREIGN_FPSTATE so that we can evaluate just before running the vcpu whether it the FP regs contain something that is owned by the vcpu or not by updating the rest of the FP flags. We do this in the hypervisor code in order to make sure we're in a context where we are not interruptible. But we already have a hook in the run loop to generate this flag. We may as well update the FP flags directly and save the pointless flag tracking. Whilst we're at it, rename update_fp_enabled() to guest_owns_fp_regs() to indicate what the leftover of this helper actually do. Signed-off-by: Marc Zyngier Reviewed-by: Reiji Watanabe Reviewed-by: Mark Brown --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/fpsimd.c | 17 ++++++++++------- arch/arm64/kvm/hyp/include/hyp/switch.h | 16 ++-------------- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- 5 files changed, 14 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 47a1e25e25bb..63103cc1bdc4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -462,7 +462,6 @@ struct kvm_vcpu_arch { #define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ -#define KVM_ARM64_FP_FOREIGN_FPSTATE (1 << 14) #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ #define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ #define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 6012b08ecb14..edbc0183c89b 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -107,16 +107,19 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) } /* - * Called just before entering the guest once we are no longer - * preemptable. Syncs the host's TIF_FOREIGN_FPSTATE with the KVM - * mirror of the flag used by the hypervisor. + * Called just before entering the guest once we are no longer preemptable + * and interrupts are disabled. If we have managed to run anything using + * FP while we were preemptible (such as off the back of an interrupt), + * then neither the host nor the guest own the FP hardware (and it was the + * responsibility of the code that used FP to save the existing state). + * + * Note that not supporting FP is basically the same thing as far as the + * hypervisor is concerned (nothing to save). 
*/ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { - if (test_thread_flag(TIF_FOREIGN_FPSTATE)) - vcpu->arch.flags |= KVM_ARM64_FP_FOREIGN_FPSTATE; - else - vcpu->arch.flags &= ~KVM_ARM64_FP_FOREIGN_FPSTATE; + if (!system_supports_fpsimd() || test_thread_flag(TIF_FOREIGN_FPSTATE)) + vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | KVM_ARM64_FP_HOST); } /* diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 37d9f211c200..e54320384943 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -37,21 +37,9 @@ struct kvm_exception_table_entry { extern struct kvm_exception_table_entry __start___kvm_ex_table; extern struct kvm_exception_table_entry __stop___kvm_ex_table; -/* Check whether the FP regs were dirtied while in the host-side run loop: */ -static inline bool update_fp_enabled(struct kvm_vcpu *vcpu) +/* Check whether the FP regs are owned by the guest */ +static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) { - /* - * When the system doesn't support FP/SIMD, we cannot rely on - * the _TIF_FOREIGN_FPSTATE flag. However, we always inject an - * abort on the very first access to FP and thus we should never - * see KVM_ARM64_FP_ENABLED. For added safety, make sure we always - * trap the accesses. - */ - if (!system_supports_fpsimd() || - vcpu->arch.flags & KVM_ARM64_FP_FOREIGN_FPSTATE) - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | - KVM_ARM64_FP_HOST); - return !!(vcpu->arch.flags & KVM_ARM64_FP_ENABLED); } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6db801db8f27..a6b9f1186577 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -43,7 +43,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; - if (!update_fp_enabled(vcpu)) { + if (!guest_owns_fp_regs(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; __activate_traps_fpsimd32(vcpu); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 969f20daf97a..46f365254e9f 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -55,7 +55,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) val |= CPTR_EL2_TAM; - if (update_fp_enabled(vcpu)) { + if (guest_owns_fp_regs(vcpu)) { if (vcpu_has_sve(vcpu)) val |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; } else { From f8077b0d59230cbb58e0b98839e04b564529a5ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:14 +0100 Subject: [PATCH 02/69] KVM: arm64: Move FP state ownership from flag to a tristate The KVM FP code uses a pair of flags to denote three states: - FP_ENABLED set: the guest owns the FP state - FP_HOST set: the host owns the FP state - FP_ENABLED and FP_HOST clear: nobody owns the FP state at all and both flags set is an illegal state, which nothing ever checks for... As it turns out, this isn't really a good match for flags, and we'd be better off if this was a simpler tristate, each state having a name that actually reflect the state: - FP_STATE_FREE - FP_STATE_HOST_OWNED - FP_STATE_GUEST_OWNED Kill the two flags, and move over to an enum encoding these three states. This results in less confusing code, and less risk of ending up in the uncharted territory of a 4th state if we forget to clear one of the two flags. 
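To illustrate the resulting model (this sketch is not part of the patch; the toy structure and helper names below are made up and assume nothing about the real kernel interfaces), the three states travel through the run loop roughly like this: loading the vcpu hands the registers to the host, the pre-entry flush can drop ownership entirely, and a guest FP trap hands them to the guest after saving whatever the host still had loaded:

  #include <stdbool.h>
  #include <stdio.h>

  enum fp_state { FP_STATE_FREE, FP_STATE_HOST_OWNED, FP_STATE_GUEST_OWNED };

  struct toy_vcpu { enum fp_state fp_state; };

  /* vcpu load: the host's FP registers are live, so the host owns them */
  static void load_fp(struct toy_vcpu *v)
  {
          v->fp_state = FP_STATE_HOST_OWNED;
  }

  /* just before entry: if someone used FP while we were preemptible,
   * nobody owns the hardware any more */
  static void ctxflush_fp(struct toy_vcpu *v, bool foreign_fpstate)
  {
          if (foreign_fpstate)
                  v->fp_state = FP_STATE_FREE;
  }

  /* FP trap from the guest: save the host state only if it is still
   * loaded, then hand the registers over to the guest */
  static void handle_fp_trap(struct toy_vcpu *v)
  {
          if (v->fp_state == FP_STATE_HOST_OWNED)
                  printf("saving host FP state\n");
          v->fp_state = FP_STATE_GUEST_OWNED;
  }

  int main(void)
  {
          struct toy_vcpu v;

          load_fp(&v);
          ctxflush_fp(&v, false);         /* host still owns the regs */
          handle_fp_trap(&v);             /* guest now owns them */
          printf("guest owned: %d\n", v.fp_state == FP_STATE_GUEST_OWNED);
          return 0;
  }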
Signed-off-by: Marc Zyngier Reviewed-by: Mark Brown Reviewed-by: Reiji Watanabe --- arch/arm64/include/asm/kvm_host.h | 9 +++++++-- arch/arm64/kvm/fpsimd.c | 14 ++++++-------- arch/arm64/kvm/hyp/include/hyp/switch.h | 8 +++----- arch/arm64/kvm/hyp/nvhe/switch.c | 4 ++-- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 63103cc1bdc4..372c5642cfab 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -325,6 +325,13 @@ struct kvm_vcpu_arch { /* Exception Information */ struct kvm_vcpu_fault_info fault; + /* Ownership of the FP regs */ + enum { + FP_STATE_FREE, + FP_STATE_HOST_OWNED, + FP_STATE_GUEST_OWNED, + } fp_state; + /* Miscellaneous vcpu state flags */ u64 flags; @@ -430,8 +437,6 @@ struct kvm_vcpu_arch { /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) -#define KVM_ARM64_FP_ENABLED (1 << 1) /* guest FP regs loaded */ -#define KVM_ARM64_FP_HOST (1 << 2) /* host FP regs loaded */ #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ #define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ #define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index edbc0183c89b..d397efe1a378 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -77,8 +77,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) BUG_ON(!current->mm); BUG_ON(test_thread_flag(TIF_SVE)); - vcpu->arch.flags &= ~KVM_ARM64_FP_ENABLED; - vcpu->arch.flags |= KVM_ARM64_FP_HOST; + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED; if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) @@ -98,9 +97,8 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED; - if (read_sysreg_s(SYS_SVCR) & - (SVCR_SM_MASK | SVCR_ZA_MASK)) { - vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; + if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { + vcpu->arch.fp_state = FP_STATE_FREE; fpsimd_save_and_flush_cpu_state(); } } @@ -119,7 +117,7 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { if (!system_supports_fpsimd() || test_thread_flag(TIF_FOREIGN_FPSTATE)) - vcpu->arch.flags &= ~(KVM_ARM64_FP_ENABLED | KVM_ARM64_FP_HOST); + vcpu->arch.fp_state = FP_STATE_FREE; } /* @@ -133,7 +131,7 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) { WARN_ON_ONCE(!irqs_disabled()); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { /* * Currently we do not support SME guests so SVCR is * always 0 and we just need a variable to point to. 
@@ -176,7 +174,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) CPACR_EL1_SMEN_EL1EN); } - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) { if (vcpu_has_sve(vcpu)) { __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e54320384943..6cbbb6c02f66 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -40,7 +40,7 @@ extern struct kvm_exception_table_entry __stop___kvm_ex_table; /* Check whether the FP regs are owned by the guest */ static inline bool guest_owns_fp_regs(struct kvm_vcpu *vcpu) { - return !!(vcpu->arch.flags & KVM_ARM64_FP_ENABLED); + return vcpu->arch.fp_state == FP_STATE_GUEST_OWNED; } /* Save the 32-bit only FPSIMD system register state */ @@ -179,10 +179,8 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) isb(); /* Write out the host state if it's in the registers */ - if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { + if (vcpu->arch.fp_state == FP_STATE_HOST_OWNED) __fpsimd_save_state(vcpu->arch.host_fpsimd_state); - vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; - } /* Restore the guest state */ if (sve_guest) @@ -194,7 +192,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) if (!(read_sysreg(hcr_el2) & HCR_RW)) write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2); - vcpu->arch.flags |= KVM_ARM64_FP_ENABLED; + vcpu->arch.fp_state = FP_STATE_GUEST_OWNED; return true; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index a6b9f1186577..764bdc423cb8 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -123,7 +123,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) } cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) + if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) cptr |= CPTR_EL2_TZ; if (cpus_have_final_cap(ARM64_SME)) cptr &= ~CPTR_EL2_TSM; @@ -335,7 +335,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg_restore_state_nvhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 46f365254e9f..bce7fc51f9a1 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -175,7 +175,7 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) sysreg_restore_host_state_vhe(host_ctxt); - if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) + if (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED) __fpsimd_save_fpexc32(vcpu); __debug_switch_to_host(vcpu); From e87abb73e5946379896cf49b10f6b57e02937a4c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:15 +0100 Subject: [PATCH 03/69] KVM: arm64: Add helpers to manipulate vcpu flags among a set Careful analysis of the vcpu flags show that this is a mix of configuration, communication between the host and the hypervisor, as well as anciliary state that has no consistency. It'd be a lot better if we could split these flags into consistent categories. However, even if we split these flags apart, we want to make sure that each flag can only be applied to its own set, and not across sets. 
To achieve this, use a preprocessor hack so that each flag is always associated with: - the set that contains it, - a mask that describe all the bits that contain it (for a simple flag, this is the same thing as the flag itself, but we will eventually have values that cover multiple bits at once). Each flag is thus a triplet that is not directly usable as a value, but used by three helpers that allow the flag to be set, cleared, and fetched. By mandating the use of such helper, we can easily enforce that a flag can only be used with the set it belongs to. Finally, one last helper "unpacks" the raw value from the triplet that represents a flag, which is useful for multi-bit values that need to be enumerated (in a switch statement, for example). Further patches will start making use of this infrastructure. Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 44 +++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 372c5642cfab..6d30ac7e3164 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -415,6 +415,50 @@ struct kvm_vcpu_arch { } steal; }; +/* + * Each 'flag' is composed of a comma-separated triplet: + * + * - the flag-set it belongs to in the vcpu->arch structure + * - the value for that flag + * - the mask for that flag + * + * __vcpu_single_flag() builds such a triplet for a single-bit flag. + * unpack_vcpu_flag() extract the flag value from the triplet for + * direct use outside of the flag accessors. + */ +#define __vcpu_single_flag(_set, _f) _set, (_f), (_f) + +#define __unpack_flag(_set, _f, _m) _f +#define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) + +#define __vcpu_get_flag(v, flagset, f, m) \ + ({ \ + v->arch.flagset & (m); \ + }) + +#define __vcpu_set_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + fset = &v->arch.flagset; \ + if (HWEIGHT(m) > 1) \ + *fset &= ~(m); \ + *fset |= (f); \ + } while (0) + +#define __vcpu_clear_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *fset; \ + \ + fset = &v->arch.flagset; \ + *fset &= ~(m); \ + } while (0) + +#define vcpu_get_flag(v, ...) __vcpu_get_flag((v), __VA_ARGS__) +#define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) +#define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) + + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) From 690bacb83bc30d14821bd32cac1c5839b4a9ac6c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:16 +0100 Subject: [PATCH 04/69] KVM: arm64: Add three sets of flags to the vcpu state It so appears that each of the vcpu flags is really belonging to one of three categories: - a configuration flag, set once and for all - an input flag generated by the kernel for the hypervisor to use - a state flag that is only for the kernel's own bookkeeping As we are going to split all the existing flags into these three sets, introduce all three in one go. No functional change other than a bit of bloat... 
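As an illustration of how the triplet is meant to be used against one of these sets (not part of the patch: the toy structure, the example flag and the simplified accessor bodies below are made up, and the real set helper additionally clears the whole mask first so that multi-bit values can be rewritten), a self-contained userspace version looks like this:

  #include <stdio.h>

  struct toy_arch {
          unsigned long cflags;   /* configuration flags */
          unsigned long iflags;   /* input flags to the hypervisor */
          unsigned long sflags;   /* host-only state flags */
  };

  struct toy_vcpu { struct toy_arch arch; };

  #define BIT(n)                          (1UL << (n))
  #define __vcpu_single_flag(_set, _f)    _set, (_f), (_f)

  /* simplified accessors: the flag expands to "set, value, mask" */
  #define __vcpu_get_flag(v, flagset, f, m)   ((v)->arch.flagset & (m))
  #define __vcpu_set_flag(v, flagset, f, m)   ((v)->arch.flagset |= (f))
  #define __vcpu_clear_flag(v, flagset, f, m) ((v)->arch.flagset &= ~(m))

  #define vcpu_get_flag(v, ...)   __vcpu_get_flag((v), __VA_ARGS__)
  #define vcpu_set_flag(v, ...)   __vcpu_set_flag((v), __VA_ARGS__)
  #define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__)

  /* the flag names its set: the accessors can only ever touch sflags */
  #define TOY_STATE_FLAG  __vcpu_single_flag(sflags, BIT(0))

  int main(void)
  {
          struct toy_vcpu v = { 0 };

          vcpu_set_flag(&v, TOY_STATE_FLAG);
          printf("set:     %d\n", !!vcpu_get_flag(&v, TOY_STATE_FLAG));
          vcpu_clear_flag(&v, TOY_STATE_FLAG);
          printf("cleared: %d\n", !vcpu_get_flag(&v, TOY_STATE_FLAG));
          return 0;
  }

Because the set name is baked into the flag definition, a given flag can never be applied to a different flag word by accident; the set membership is fixed at the definition site.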
Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6d30ac7e3164..af45320f247f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -335,6 +335,15 @@ struct kvm_vcpu_arch { /* Miscellaneous vcpu state flags */ u64 flags; + /* Configuration flags, set once and for all before the vcpu can run */ + u64 cflags; + + /* Input flags to the hypervisor code, potentially cleared after use */ + u64 iflags; + + /* State flags for kernel bookkeeping, unused by the hypervisor code */ + u64 sflags; + /* * We maintain more than a single set of debug registers to support * debugging the guest from the host and to maintain separate host and From 4c0680d394d8a77868049931101e4a59372346b5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:17 +0100 Subject: [PATCH 05/69] KVM: arm64: Move vcpu configuration flags into their own set The KVM_ARM64_{GUEST_HAS_SVE,VCPU_SVE_FINALIZED,GUEST_HAS_PTRAUTH} flags are purely configuration flags. Once set, they are never cleared, but evaluated all over the code base. Move these three flags into the configuration set in one go, using the new accessors, and take this opportunity to drop the KVM_ARM64_ prefix which doesn't provide any help. Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 17 ++++++++++------- arch/arm64/kvm/reset.c | 6 +++--- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index af45320f247f..66a08b0e12a8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -467,6 +467,13 @@ struct kvm_vcpu_arch { #define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) #define vcpu_clear_flag(v, ...) 
__vcpu_clear_flag((v), __VA_ARGS__) +/* SVE exposed to guest */ +#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) +/* SVE config completed */ +#define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) +/* PTRAUTH exposed to guest */ +#define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ @@ -491,9 +498,6 @@ struct kvm_vcpu_arch { /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_GUEST_HAS_SVE (1 << 5) /* SVE exposed to guest */ -#define KVM_ARM64_VCPU_SVE_FINALIZED (1 << 6) /* SVE config completed */ -#define KVM_ARM64_GUEST_HAS_PTRAUTH (1 << 7) /* PTRAUTH exposed to guest */ #define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ /* * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be @@ -530,13 +534,13 @@ struct kvm_vcpu_arch { KVM_GUESTDBG_SINGLESTEP) #define vcpu_has_sve(vcpu) (system_supports_sve() && \ - ((vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_SVE)) + vcpu_get_flag(vcpu, GUEST_HAS_SVE)) #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ - (vcpu)->arch.flags & KVM_ARM64_GUEST_HAS_PTRAUTH) + vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) #else #define vcpu_has_ptrauth(vcpu) false #endif @@ -893,8 +897,7 @@ void kvm_init_protected_traps(struct kvm_vcpu *vcpu); int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); -#define kvm_arm_vcpu_sve_finalized(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_VCPU_SVE_FINALIZED) +#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED) #define kvm_has_mte(kvm) \ (system_supports_mte() && \ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 6c70c6f61c70..0e08fbe68715 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -81,7 +81,7 @@ static int kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_SVE; + vcpu_set_flag(vcpu, GUEST_HAS_SVE); return 0; } @@ -120,7 +120,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu) } vcpu->arch.sve_state = buf; - vcpu->arch.flags |= KVM_ARM64_VCPU_SVE_FINALIZED; + vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED); return 0; } @@ -177,7 +177,7 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) !system_has_full_ptr_auth()) return -EINVAL; - vcpu->arch.flags |= KVM_ARM64_GUEST_HAS_PTRAUTH; + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); return 0; } From 699bb2e0c6f3796549dabac329501df7ffd99439 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:18 +0100 Subject: [PATCH 06/69] KVM: arm64: Move vcpu PC/Exception flags to the input flag set The PC update flags (which also deal with exception injection) is one of the most complicated use of the flag we have. Make it more fool prof by: - moving it over to the new accessors and assign it to the input flag set - turn the combination of generic ELx flags with another flag indicating the target EL itself into an explicit set of flags for each EL and vector combination - add a new accessor to pend the exception This is otherwise a pretty straightformward conversion. 
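To show how a multi-bit value can live inside one of these flag words (purely illustrative: the field layout, shift and names below are made up and do not claim to match the exact kernel encoding), here is a toy version of the clear-the-mask-then-or idiom together with the switch that consumes the unpacked value:

  #include <stdio.h>

  #define GENMASK(h, l)   (((1UL << ((h) - (l) + 1)) - 1) << (l))

  static unsigned long iflags;

  #define EXCEPT_MASK     GENMASK(3, 1)   /* a 3-bit field inside iflags */
  #define EXCEPT_SHIFT    1
  #define EXCEPT_VAL(v)   ((unsigned long)(v) << EXCEPT_SHIFT)

  /* overwrite the whole field, then or in the new value */
  static void set_except(unsigned long val)
  {
          iflags &= ~EXCEPT_MASK;
          iflags |= val;
  }

  int main(void)
  {
          set_except(EXCEPT_VAL(2));      /* e.g. an "FIQ"-style entry */

          switch (iflags & EXCEPT_MASK) {
          case EXCEPT_VAL(0): printf("sync\n"); break;
          case EXCEPT_VAL(1): printf("irq\n");  break;
          case EXCEPT_VAL(2): printf("fiq\n");  break;
          default:            printf("other\n"); break;
          }
          return 0;
  }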
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 9 ++++- arch/arm64/include/asm/kvm_host.h | 58 ++++++++++++++++------------ arch/arm64/kvm/arm.c | 4 +- arch/arm64/kvm/hyp/exception.c | 23 ++++++----- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 4 +- arch/arm64/kvm/inject_fault.c | 17 +++----- 6 files changed, 61 insertions(+), 54 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 0e66edd3aff2..6ec58080ece8 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -473,9 +473,16 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= KVM_ARM64_INCREMENT_PC; + vcpu_set_flag(vcpu, INCREMENT_PC); } +#define kvm_pend_exception(v, e) \ + do { \ + vcpu_set_flag((v), PENDING_EXCEPTION); \ + vcpu_set_flag((v), e); \ + } while (0) + + static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) { return test_bit(feature, vcpu->arch.features); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 66a08b0e12a8..db42b4c06449 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -474,6 +474,40 @@ struct kvm_vcpu_arch { /* PTRAUTH exposed to guest */ #define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) +/* Exception pending */ +#define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) +/* + * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't + * be set together with an exception... + */ +#define INCREMENT_PC __vcpu_single_flag(iflags, BIT(1)) +/* Target EL/MODE (not a single flag, but let's abuse the macro) */ +#define EXCEPT_MASK __vcpu_single_flag(iflags, GENMASK(3, 1)) + +/* Helpers to encode exceptions with minimum fuss */ +#define __EXCEPT_MASK_VAL unpack_vcpu_flag(EXCEPT_MASK) +#define __EXCEPT_SHIFT __builtin_ctzl(__EXCEPT_MASK_VAL) +#define __vcpu_except_flags(_f) iflags, (_f << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL + +/* + * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following + * values: + * + * For AArch32 EL1: + */ +#define EXCEPT_AA32_UND __vcpu_except_flags(0) +#define EXCEPT_AA32_IABT __vcpu_except_flags(1) +#define EXCEPT_AA32_DABT __vcpu_except_flags(2) +/* For AArch64: */ +#define EXCEPT_AA64_EL1_SYNC __vcpu_except_flags(0) +#define EXCEPT_AA64_EL1_IRQ __vcpu_except_flags(1) +#define EXCEPT_AA64_EL1_FIQ __vcpu_except_flags(2) +#define EXCEPT_AA64_EL1_SERR __vcpu_except_flags(3) +/* For AArch64 with NV (one day): */ +#define EXCEPT_AA64_EL2_SYNC __vcpu_except_flags(4) +#define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) +#define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) +#define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ @@ -498,30 +532,6 @@ struct kvm_vcpu_arch { /* vcpu_arch flags field values: */ #define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_PENDING_EXCEPTION (1 << 8) /* Exception pending */ -/* - * Overlaps with KVM_ARM64_EXCEPT_MASK on purpose so that it can't be - * set together with an exception... 
- */ -#define KVM_ARM64_INCREMENT_PC (1 << 9) /* Increment PC */ -#define KVM_ARM64_EXCEPT_MASK (7 << 9) /* Target EL/MODE */ -/* - * When KVM_ARM64_PENDING_EXCEPTION is set, KVM_ARM64_EXCEPT_MASK can - * take the following values: - * - * For AArch32 EL1: - */ -#define KVM_ARM64_EXCEPT_AA32_UND (0 << 9) -#define KVM_ARM64_EXCEPT_AA32_IABT (1 << 9) -#define KVM_ARM64_EXCEPT_AA32_DABT (2 << 9) -/* For AArch64: */ -#define KVM_ARM64_EXCEPT_AA64_ELx_SYNC (0 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_IRQ (1 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_FIQ (2 << 9) -#define KVM_ARM64_EXCEPT_AA64_ELx_SERR (3 << 9) -#define KVM_ARM64_EXCEPT_AA64_EL1 (0 << 11) -#define KVM_ARM64_EXCEPT_AA64_EL2 (1 << 11) - #define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 400bb0fe2745..5beabbe69585 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1013,8 +1013,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * the vcpu state. Note that this relies on __kvm_adjust_pc() * being preempt-safe on VHE. */ - if (unlikely(vcpu->arch.flags & (KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_INCREMENT_PC))) + if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) || + vcpu_get_flag(vcpu, INCREMENT_PC))) kvm_call_hyp(__kvm_adjust_pc, vcpu); vcpu_put(vcpu); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index c5d009715402..b7557b25ed56 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -303,14 +303,14 @@ static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) static void kvm_inject_exception(struct kvm_vcpu *vcpu) { if (vcpu_el1_is_32bit(vcpu)) { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case KVM_ARM64_EXCEPT_AA32_UND: + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA32_UND): enter_exception32(vcpu, PSR_AA32_MODE_UND, 4); break; - case KVM_ARM64_EXCEPT_AA32_IABT: + case unpack_vcpu_flag(EXCEPT_AA32_IABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12); break; - case KVM_ARM64_EXCEPT_AA32_DABT: + case unpack_vcpu_flag(EXCEPT_AA32_DABT): enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16); break; default: @@ -318,9 +318,8 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) break; } } else { - switch (vcpu->arch.flags & KVM_ARM64_EXCEPT_MASK) { - case (KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_EXCEPT_AA64_EL1): + switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) { + case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC): enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync); break; default: @@ -340,12 +339,12 @@ static void kvm_inject_exception(struct kvm_vcpu *vcpu) */ void __kvm_adjust_pc(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_PENDING_EXCEPTION) { + if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) { kvm_inject_exception(vcpu); - vcpu->arch.flags &= ~(KVM_ARM64_PENDING_EXCEPTION | - KVM_ARM64_EXCEPT_MASK); - } else if (vcpu->arch.flags & KVM_ARM64_INCREMENT_PC) { + vcpu_clear_flag(vcpu, PENDING_EXCEPTION); + vcpu_clear_flag(vcpu, EXCEPT_MASK); + } else if (vcpu_get_flag(vcpu, INCREMENT_PC)) { kvm_skip_instr(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_INCREMENT_PC; + vcpu_clear_flag(vcpu, INCREMENT_PC); } } diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index b6d86e423319..edd3eabf520f 100644 
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -38,9 +38,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR); *vcpu_cpsr(vcpu) = read_sysreg_el2(SYS_SPSR); - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/inject_fault.c b/arch/arm64/kvm/inject_fault.c index 55a5dbe957e0..f32f4a2a347f 100644 --- a/arch/arm64/kvm/inject_fault.c +++ b/arch/arm64/kvm/inject_fault.c @@ -20,9 +20,7 @@ static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr bool is_aarch32 = vcpu_mode_is_32bit(vcpu); u64 esr = 0; - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); vcpu_write_sys_reg(vcpu, addr, FAR_EL1); @@ -52,9 +50,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) { u64 esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT); - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA64_EL1 | - KVM_ARM64_EXCEPT_AA64_ELx_SYNC | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); /* * Build an unknown exception, depending on the instruction @@ -73,8 +69,7 @@ static void inject_undef64(struct kvm_vcpu *vcpu) static void inject_undef32(struct kvm_vcpu *vcpu) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_UND | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_UND); } /* @@ -97,14 +92,12 @@ static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr) far = vcpu_read_sys_reg(vcpu, FAR_EL1); if (is_pabt) { - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_IABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_IABT); far &= GENMASK(31, 0); far |= (u64)addr << 32; vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2); } else { /* !iabt */ - vcpu->arch.flags |= (KVM_ARM64_EXCEPT_AA32_DABT | - KVM_ARM64_PENDING_EXCEPTION); + kvm_pend_exception(vcpu, EXCEPT_AA32_DABT); far &= GENMASK(63, 32); far |= addr; vcpu_write_sys_reg(vcpu, fsr, ESR_EL1); From 802b91118d11227b527153849ea761b280691373 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 23 May 2022 16:51:51 +0200 Subject: [PATCH 07/69] arm64: kasan: do not instrument stacktrace.c Disable KASAN instrumentation of arch/arm64/kernel/stacktrace.c. This speeds up Generic KASAN by 5-20%. As a side-effect, KASAN is now unable to detect bugs in the stack trace collection code. This is taken as an acceptable downside. Also replace READ_ONCE_NOCHECK() with READ_ONCE() in stacktrace.c. As the file is now not instrumented, there is no need to use the NOCHECK version of READ_ONCE(). 
Suggested-by: Mark Rutland Acked-by: Mark Rutland Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/c4c944a2a905e949760fbeb29258185087171708.1653317461.git.andreyknvl@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/Makefile | 5 +++++ arch/arm64/kernel/stacktrace.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fa7981d0d917..7075a9c6a4a6 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong CFLAGS_syscall.o += -fno-stack-protector +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + # It's not safe to invoke KCOV when portions of the kernel environment aren't # available or are out-of-sync with HW state. Since `noinstr` doesn't always # inhibit KCOV instrumentation, disable it for the entire compilation unit. diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 0467cb79f080..c246e8d9f95b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -124,8 +124,8 @@ static int notrace unwind_next(struct task_struct *tsk, * Record this frame record's values and location. The prev_fp and * prev_type are only meaningful to the next unwind_next() invocation. */ - state->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - state->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); + state->fp = READ_ONCE(*(unsigned long *)(fp)); + state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); state->prev_fp = fp; state->prev_type = info.type; From 446297b28a21244e4045026c4599d1b14a67e2ce Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 23 May 2022 16:51:52 +0200 Subject: [PATCH 08/69] arm64: stacktrace: use non-atomic __set_bit Use the non-atomic version of set_bit() in arch/arm64/kernel/stacktrace.c, as there is no concurrent accesses to frame->prev_type. This speeds up stack trace collection and improves the boot time of Generic KASAN by 2-5%. Suggested-by: Mark Rutland Acked-by: Mark Rutland Signed-off-by: Andrey Konovalov Link: https://lore.kernel.org/r/23dfa36d1cc91e4a1059945b7834eac22fb9854d.1653317461.git.andreyknvl@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c246e8d9f95b..d6bef106e37e 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -117,7 +117,7 @@ static int notrace unwind_next(struct task_struct *tsk, if (fp <= state->prev_fp) return -EINVAL; } else { - set_bit(state->prev_type, state->stacks_done); + __set_bit(state->prev_type, state->stacks_done); } /* From a019d8a2cc82a95880677fb0ec16d1d4e8647df7 Mon Sep 17 00:00:00 2001 From: "Madhavan T. Venkataraman" Date: Fri, 17 Jun 2022 13:02:14 -0500 Subject: [PATCH 09/69] arm64: Split unwind_init() unwind_init() is currently a single function that initializes all of the unwind state. Split it into the following functions and call them appropriately: - unwind_init_from_regs() - initialize from regs passed by caller. - unwind_init_from_caller() - initialize for the current task from the caller of arch_stack_walk(). 
- unwind_init_from_task() - initialize from the saved state of a task other than the current task. In this case, the other task must not be running. This is done for two reasons: - the different ways of initializing are clear - specialized code can be added to each initializer in the future. Signed-off-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220617180219.20352-2-madvenka@linux.microsoft.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 66 ++++++++++++++++++++++++++++------ 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d6bef106e37e..91934cabbe8b 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -50,11 +50,8 @@ struct unwind_state { #endif }; -static notrace void unwind_init(struct unwind_state *state, unsigned long fp, - unsigned long pc) +static void unwind_init_common(struct unwind_state *state) { - state->fp = fp; - state->pc = pc; #ifdef CONFIG_KRETPROBES state->kr_cur = NULL; #endif @@ -72,7 +69,57 @@ static notrace void unwind_init(struct unwind_state *state, unsigned long fp, state->prev_fp = 0; state->prev_type = STACK_TYPE_UNKNOWN; } -NOKPROBE_SYMBOL(unwind_init); + +/* + * Start an unwind from a pt_regs. + * + * The unwind will begin at the PC within the regs. + * + * The regs must be on a stack currently owned by the calling task. + */ +static inline void unwind_init_from_regs(struct unwind_state *state, + struct pt_regs *regs) +{ + unwind_init_common(state); + + state->fp = regs->regs[29]; + state->pc = regs->pc; +} + +/* + * Start an unwind from a caller. + * + * The unwind will begin at the caller of whichever function this is inlined + * into. + * + * The function which invokes this must be noinline. + */ +static __always_inline void unwind_init_from_caller(struct unwind_state *state) +{ + unwind_init_common(state); + + state->fp = (unsigned long)__builtin_frame_address(1); + state->pc = (unsigned long)__builtin_return_address(0); +} + +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked tasks saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static inline void unwind_init_from_task(struct unwind_state *state, + struct task_struct *task) +{ + unwind_init_common(state); + + state->fp = thread_saved_fp(task); + state->pc = thread_saved_pc(task); +} /* * Unwind from one frame record (A) to the next frame record (B). @@ -213,14 +260,11 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, struct unwind_state state; if (regs) - unwind_init(&state, regs->regs[29], regs->pc); + unwind_init_from_regs(&state, regs); else if (task == current) - unwind_init(&state, - (unsigned long)__builtin_frame_address(1), - (unsigned long)__builtin_return_address(0)); + unwind_init_from_caller(&state); else - unwind_init(&state, thread_saved_fp(task), - thread_saved_pc(task)); + unwind_init_from_task(&state, task); unwind(task, &state, consume_entry, cookie); } From 82a592c13b0aeff94d84d54183dae0b26384c95f Mon Sep 17 00:00:00 2001 From: "Madhavan T. 
Venkataraman" Date: Fri, 17 Jun 2022 13:02:15 -0500 Subject: [PATCH 10/69] arm64: Copy the task argument to unwind_state Copy the task argument passed to arch_stack_walk() to unwind_state so that it can be passed to unwind functions via unwind_state rather than as a separate argument. The task is a fundamental part of the unwind state. Signed-off-by: Madhavan T. Venkataraman Reviewed-by: Mark Brown Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20220617180219.20352-3-madvenka@linux.microsoft.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 91934cabbe8b..fcaa151b81f1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -38,6 +38,8 @@ * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance * associated with the most recently encountered replacement lr * value. + * + * @task: The task being unwound. */ struct unwind_state { unsigned long fp; @@ -48,10 +50,13 @@ struct unwind_state { #ifdef CONFIG_KRETPROBES struct llist_node *kr_cur; #endif + struct task_struct *task; }; -static void unwind_init_common(struct unwind_state *state) +static void unwind_init_common(struct unwind_state *state, + struct task_struct *task) { + state->task = task; #ifdef CONFIG_KRETPROBES state->kr_cur = NULL; #endif @@ -80,7 +85,7 @@ static void unwind_init_common(struct unwind_state *state) static inline void unwind_init_from_regs(struct unwind_state *state, struct pt_regs *regs) { - unwind_init_common(state); + unwind_init_common(state, current); state->fp = regs->regs[29]; state->pc = regs->pc; @@ -96,7 +101,7 @@ static inline void unwind_init_from_regs(struct unwind_state *state, */ static __always_inline void unwind_init_from_caller(struct unwind_state *state) { - unwind_init_common(state); + unwind_init_common(state, current); state->fp = (unsigned long)__builtin_frame_address(1); state->pc = (unsigned long)__builtin_return_address(0); @@ -115,7 +120,7 @@ static __always_inline void unwind_init_from_caller(struct unwind_state *state) static inline void unwind_init_from_task(struct unwind_state *state, struct task_struct *task) { - unwind_init_common(state); + unwind_init_common(state, task); state->fp = thread_saved_fp(task); state->pc = thread_saved_pc(task); @@ -128,9 +133,9 @@ static inline void unwind_init_from_task(struct unwind_state *state, * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B. 
*/ -static int notrace unwind_next(struct task_struct *tsk, - struct unwind_state *state) +static int notrace unwind_next(struct unwind_state *state) { + struct task_struct *tsk = state->task; unsigned long fp = state->fp; struct stack_info info; @@ -204,8 +209,7 @@ static int notrace unwind_next(struct task_struct *tsk, } NOKPROBE_SYMBOL(unwind_next); -static void notrace unwind(struct task_struct *tsk, - struct unwind_state *state, +static void notrace unwind(struct unwind_state *state, stack_trace_consume_fn consume_entry, void *cookie) { while (1) { @@ -213,7 +217,7 @@ static void notrace unwind(struct task_struct *tsk, if (!consume_entry(cookie, state->pc)) break; - ret = unwind_next(tsk, state); + ret = unwind_next(state); if (ret < 0) break; } @@ -259,12 +263,15 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, { struct unwind_state state; - if (regs) + if (regs) { + if (task != current) + return; unwind_init_from_regs(&state, regs); - else if (task == current) + } else if (task == current) { unwind_init_from_caller(&state); - else + } else { unwind_init_from_task(&state, task); + } - unwind(task, &state, consume_entry, cookie); + unwind(&state, consume_entry, cookie); } From b1da49088ac68a21c613efd734dada8272ec0b00 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:19 +0100 Subject: [PATCH 11/69] KVM: arm64: Move vcpu debug/SPE/TRBE flags to the input flag set The three debug flags (which deal with the debug registers, SPE and TRBE) all are input flags to the hypervisor code. Move them into the input set and convert them to the new accessors. Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 9 +++++--- arch/arm64/kvm/debug.c | 25 +++++++++++----------- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 6 +++--- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 4 ++-- arch/arm64/kvm/hyp/nvhe/debug-sr.c | 8 +++---- arch/arm64/kvm/sys_regs.c | 8 +++---- 6 files changed, 31 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index db42b4c06449..4c7446400b77 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -508,6 +508,12 @@ struct kvm_vcpu_arch { #define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) #define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) #define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) +/* Guest debug is live */ +#define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) +/* Save SPE context if active */ +#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) +/* Save TRBE context if active */ +#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ @@ -530,10 +536,7 @@ struct kvm_vcpu_arch { }) /* vcpu_arch flags field values: */ -#define KVM_ARM64_DEBUG_DIRTY (1 << 0) #define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ -#define KVM_ARM64_DEBUG_STATE_SAVE_SPE (1 << 12) /* Save SPE context if active */ -#define KVM_ARM64_DEBUG_STATE_SAVE_TRBE (1 << 13) /* Save TRBE context if active */ #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ #define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ #define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 4fd5c216c4bb..0b28d7db7c76 100644 
--- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -104,11 +104,11 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) * Trap debug register access when one of the following is true: * - Userspace is using the hardware to debug the guest * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear). + * - The guest is not using debug (DEBUG_DIRTY clear). * - The guest has enabled the OS Lock (debug exceptions are blocked). */ if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) || + !vcpu_get_flag(vcpu, DEBUG_DIRTY) || kvm_vcpu_os_lock_enabled(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; @@ -147,8 +147,8 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * debug related registers. * * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY - * flag on vcpu->arch.flags). Since the guest must not interfere + * the guest is not actively using them (see the DEBUG_DIRTY + * flag on vcpu->arch.iflags). Since the guest must not interfere * with the hardware state when debugging the guest, we must ensure that * trapping is enabled whenever we are debugging the guest using the * debug registers. @@ -205,9 +205,8 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) * * We simply switch the debug_ptr to point to our new * external_debug_state which has been populated by the - * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY - * mechanism ensures the registers are updated on the - * world switch. + * debug ioctl. The existing DEBUG_DIRTY mechanism ensures + * the registers are updated on the world switch. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { /* Enable breakpoints/watchpoints */ @@ -216,7 +215,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); trace_kvm_arm_set_regset("BKPTS", get_num_brps(), &vcpu->arch.debug_ptr->dbg_bcr[0], @@ -246,7 +245,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) /* If KDE or MDE are set, perform a full save/restore cycle. 
*/ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); /* Write mdcr_el2 changes since vcpu_load on VHE systems */ if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) @@ -298,16 +297,16 @@ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) */ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) && !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); /* Check if we have TRBE implemented and available at the host */ if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) && !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) - vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE; + vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) { - vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE | - KVM_ARM64_DEBUG_STATE_SAVE_TRBE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); + vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 4ebe9f558f3a..961bbef104a6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -132,7 +132,7 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -151,7 +151,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) + if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) return; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; @@ -162,7 +162,7 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - vcpu->arch.flags &= ~KVM_ARM64_DEBUG_DIRTY; + vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index 7ecca8b07851..baa5b9b3dde5 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -195,7 +195,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -212,7 +212,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY) + if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index df361d839902..e17455773b98 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ 
b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -84,10 +84,10 @@ static void __debug_restore_trace(u64 trfcr_el1) void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); /* Disable and flush Self-Hosted Trace generation */ - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); } @@ -98,9 +98,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); - if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) + if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c06c0477fab5..f24797c57df8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -387,7 +387,7 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, { if (p->is_write) { vcpu_write_sys_reg(vcpu, p->regval, r->reg); - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } else { p->regval = vcpu_read_sys_reg(vcpu, r->reg); } @@ -403,8 +403,8 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits * - * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the - * hyp.S code switches between host and guest values in future. + * All writes will set the DEBUG_DIRTY flag to ensure the hyp code + * switches between host and guest values in future. */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -420,7 +420,7 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; + vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, From 0affa37fcd1d6f701a0fe805c4ceb7f348d377d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:20 +0100 Subject: [PATCH 12/69] KVM: arm64: Move vcpu SVE/SME flags to the state flag set The two HOST_{SVE,SME}_ENABLED are only used for the host kernel to track its own state across a vcpu run so that it can be fully restored. Move these flags to the so called state set. 
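A minimal sketch of the load/put pairing these state flags implement (illustrative only: the cpacr variable and the helpers below stand in for the real sysreg accessors): sample the host-owned enable bit when the vcpu is loaded, and put it back exactly as found when the vcpu is put.

  #include <stdbool.h>
  #include <stdio.h>

  #define EL0_ENABLE      (1U << 0)       /* toy stand-in for CPACR_EL1.ZEN_EL0EN */

  static unsigned int cpacr = EL0_ENABLE; /* "hardware" trap control */
  static bool host_el0_enabled;           /* toy stand-in for the state flag */

  static void vcpu_load_fp(void)
  {
          host_el0_enabled = cpacr & EL0_ENABLE;  /* remember the host setting */
  }

  static void vcpu_put_fp(void)
  {
          if (host_el0_enabled)                   /* restore what the host had */
                  cpacr |= EL0_ENABLE;
          else
                  cpacr &= ~EL0_ENABLE;
  }

  int main(void)
  {
          vcpu_load_fp();
          cpacr &= ~EL0_ENABLE;   /* the guest run may have cleared the bit */
          vcpu_put_fp();
          printf("restored: %d\n", !!(cpacr & EL0_ENABLE));
          return 0;
  }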
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 8 +++++--- arch/arm64/kvm/fpsimd.c | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4c7446400b77..4f147bdc5ce9 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -515,6 +515,11 @@ struct kvm_vcpu_arch { /* Save TRBE context if active */ #define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) +/* SVE enabled for host EL0 */ +#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) +/* SME enabled for EL0 */ +#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) + /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ sve_ffr_offset((vcpu)->arch.sve_max_vl)) @@ -536,11 +541,8 @@ struct kvm_vcpu_arch { }) /* vcpu_arch flags field values: */ -#define KVM_ARM64_HOST_SVE_ENABLED (1 << 4) /* SVE enabled for EL0 */ #define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ -#define KVM_ARM64_HOST_SME_ENABLED (1 << 16) /* SME enabled for EL0 */ #define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ - #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index d397efe1a378..557a96f8e06a 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -79,9 +79,9 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) vcpu->arch.fp_state = FP_STATE_HOST_OWNED; - vcpu->arch.flags &= ~KVM_ARM64_HOST_SVE_ENABLED; + vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu->arch.flags |= KVM_ARM64_HOST_SVE_ENABLED; + vcpu_set_flag(vcpu, HOST_SVE_ENABLED); /* * We don't currently support SME guests but if we leave @@ -93,9 +93,9 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) * operations. Do this for ZA as well for now for simplicity. */ if (system_supports_sme()) { - vcpu->arch.flags &= ~KVM_ARM64_HOST_SME_ENABLED; + vcpu_clear_flag(vcpu, HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu->arch.flags |= KVM_ARM64_HOST_SME_ENABLED; + vcpu_set_flag(vcpu, HOST_SME_ENABLED); if (read_sysreg_s(SYS_SVCR) & (SVCR_SM_MASK | SVCR_ZA_MASK)) { vcpu->arch.fp_state = FP_STATE_FREE; @@ -164,7 +164,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) */ if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ - if (vcpu->arch.flags & KVM_ARM64_HOST_SME_ENABLED) + if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_SMEN_EL0EN | CPACR_EL1_SMEN_EL1EN); @@ -193,7 +193,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ - if (vcpu->arch.flags & KVM_ARM64_HOST_SVE_ENABLED) + if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); From aff3ccd7320eed5814d317fcb80244f474d66a84 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:21 +0100 Subject: [PATCH 13/69] KVM: arm64: Move vcpu ON_UNSUPPORTED_CPU flag to the state flag set The ON_UNSUPPORTED_CPU flag is only there to track the sad fact that we have ended-up on a CPU where we cannot really run. 
Since this is only for the host kernel's use, move it to the state set. Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4f147bdc5ce9..0c22514cb7c7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -519,6 +519,8 @@ struct kvm_vcpu_arch { #define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) /* SME enabled for EL0 */ #define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) +/* Physical CPU not in supported_cpus */ +#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ @@ -541,7 +543,6 @@ struct kvm_vcpu_arch { }) /* vcpu_arch flags field values: */ -#define KVM_ARM64_ON_UNSUPPORTED_CPU (1 << 15) /* Physical CPU not in supported_cpus */ #define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ @@ -561,13 +562,13 @@ struct kvm_vcpu_arch { #endif #define vcpu_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags & KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_set_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags |= KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_clear_on_unsupported_cpu(vcpu) \ - ((vcpu)->arch.flags &= ~KVM_ARM64_ON_UNSUPPORTED_CPU) + vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU) #define vcpu_gp_regs(v) (&(v)->arch.ctxt.regs) From eebc538d8e07e0ec759823664cbe2011a8bd885d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:22 +0100 Subject: [PATCH 14/69] KVM: arm64: Move vcpu WFIT flag to the state flag set The host kernel uses the WFIT flag to remember that a vcpu has used this instruction and wake it up as required. Move it to the state set, as nothing in the hypervisor uses this information. 
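The lifecycle is small enough to sketch (illustrative only, with made-up names): the flag is set when the trapped instruction is WFIT rather than WFI, consulted when deciding whether the halt should be bounded by a deadline, and cleared once the vcpu resumes.

  #include <stdbool.h>
  #include <stdio.h>

  static bool in_wfit;

  static void handle_wfx_trap(bool is_wfit)
  {
          if (is_wfit)
                  in_wfit = true;         /* remember: this halt has a deadline */
          /* ... halt the vcpu ... */
  }

  static long wakeup_delay_ns(long deadline_ns, long now_ns)
  {
          if (!in_wfit)
                  return -1;              /* plain WFI: no timeout */
          return deadline_ns > now_ns ? deadline_ns - now_ns : 0;
  }

  static void vcpu_resume(void)
  {
          in_wfit = false;                /* the halt is over, forget the deadline */
  }

  int main(void)
  {
          handle_wfx_trap(true);
          printf("delay: %ld ns\n", wakeup_delay_ns(1000, 400));
          vcpu_resume();
          printf("delay after resume: %ld ns\n", wakeup_delay_ns(1000, 400));
          return 0;
  }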
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 4 ++-- arch/arm64/kvm/arch_timer.c | 2 +- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/handle_exit.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0c22514cb7c7..0fb1a5b86f16 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -521,6 +521,8 @@ struct kvm_vcpu_arch { #define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) /* Physical CPU not in supported_cpus */ #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) +/* WFIT instruction trapped */ +#define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ @@ -542,8 +544,6 @@ struct kvm_vcpu_arch { __size_ret; \ }) -/* vcpu_arch flags field values: */ -#define KVM_ARM64_WFIT (1 << 17) /* WFIT instruction trapped */ #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_USE_HW | \ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 4e39ace073af..5290ca5db663 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -242,7 +242,7 @@ static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx) static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu) { return (cpus_have_final_cap(ARM64_HAS_WFXT) && - (vcpu->arch.flags & KVM_ARM64_WFIT)); + vcpu_get_flag(vcpu, IN_WFIT)); } static u64 wfit_delay_ns(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 5beabbe69585..8b9da9d30485 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -657,7 +657,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu) preempt_enable(); kvm_vcpu_halt(vcpu); - vcpu->arch.flags &= ~KVM_ARM64_WFIT; + vcpu_clear_flag(vcpu, IN_WFIT); kvm_clear_request(KVM_REQ_UNHALT, vcpu); preempt_disable(); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f66c0142b335..d045f5b973b9 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -120,7 +120,7 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); } else { if (esr & ESR_ELx_WFx_ISS_WFxT) - vcpu->arch.flags |= KVM_ARM64_WFIT; + vcpu_set_flag(vcpu, IN_WFIT); kvm_vcpu_wfi(vcpu); } From 781e3ae148fd2f9b0cf9b5b94f6c32f2361eb7c0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:23 +0100 Subject: [PATCH 15/69] KVM: arm64: Kill unused vcpu flags field Horray, we have now sorted all the preexisting flags, and the 'flags' field is now unused. Get rid of it while nobody is looking. 
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 0fb1a5b86f16..39da28f85045 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -332,9 +332,6 @@ struct kvm_vcpu_arch { FP_STATE_GUEST_OWNED, } fp_state; - /* Miscellaneous vcpu state flags */ - u64 flags; - /* Configuration flags, set once and for all before the vcpu can run */ u64 cflags; From 30b6ab45f81334e83dcb440451b6a4c4a753a118 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:24 +0100 Subject: [PATCH 16/69] KVM: arm64: Convert vcpu sysregs_loaded_on_cpu to a state flag The aptly named boolean 'sysregs_loaded_on_cpu' tracks whether some of the vcpu system registers are resident on the physical CPU when running in VHE mode. This is obviously a flag in hidding, so let's convert it to a state flag, since this is solely a host concern (the hypervisor itself always knows which state we're in). Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 4 ++-- arch/arm64/kvm/sys_regs.c | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 39da28f85045..ffbeb5f5692e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -410,10 +410,6 @@ struct kvm_vcpu_arch { /* Additional reset state */ struct vcpu_reset_state reset_state; - /* True when deferrable sysregs are loaded on the physical CPU, - * see kvm_vcpu_load_sysregs_vhe and kvm_vcpu_put_sysregs_vhe. 
*/ - bool sysregs_loaded_on_cpu; - /* Guest PV state */ struct { u64 last_steal; @@ -520,6 +516,8 @@ struct kvm_vcpu_arch { #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) /* WFIT instruction trapped */ #define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) +/* vcpu system registers loaded on physical CPU */ +#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) + \ diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 007a12dd4351..7b44f6b3b547 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -79,7 +79,7 @@ void kvm_vcpu_load_sysregs_vhe(struct kvm_vcpu *vcpu) __sysreg_restore_user_state(guest_ctxt); __sysreg_restore_el1_state(guest_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = true; + vcpu_set_flag(vcpu, SYSREGS_ON_CPU); activate_traps_vhe_load(vcpu); } @@ -110,5 +110,5 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) /* Restore host user state */ __sysreg_restore_user_state(host_ctxt); - vcpu->arch.sysregs_loaded_on_cpu = false; + vcpu_clear_flag(vcpu, SYSREGS_ON_CPU); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f24797c57df8..1c562bcfeccf 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -72,7 +72,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) { u64 val = 0x8badf00d8badf00d; - if (vcpu->arch.sysregs_loaded_on_cpu && + if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && __vcpu_read_sys_reg_from_cpu(reg, &val)) return val; @@ -81,7 +81,7 @@ u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg) void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) { - if (vcpu->arch.sysregs_loaded_on_cpu && + if (vcpu_get_flag(vcpu, SYSREGS_ON_CPU) && __vcpu_write_sys_reg_to_cpu(val, reg)) return; From e19f2c6cd14668c0d5b1cef280632b7ca5893118 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:25 +0100 Subject: [PATCH 17/69] KVM: arm64: Warn when PENDING_EXCEPTION and INCREMENT_PC are set together We really don't want PENDING_EXCEPTION and INCREMENT_PC to ever be set at the same time, as they are mutually exclusive. Add checks that will generate a warning should this ever happen. 
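As an illustration, the kind of misuse the new checks catch looks like this (a hypothetical caller; EXCEPT_AA64_EL1_SYNC merely stands in for whichever exception flag would be pended):

    /* Hypothetical buggy sequence that would now trigger a WARN_ON() */
    kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC); /* sets PENDING_EXCEPTION */
    kvm_incr_pc(vcpu);                              /* warns: INCREMENT_PC and
                                                     * PENDING_EXCEPTION are
                                                     * mutually exclusive */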
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 6ec58080ece8..9bdba47f7e14 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -473,11 +473,13 @@ static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) { + WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION)); vcpu_set_flag(vcpu, INCREMENT_PC); } #define kvm_pend_exception(v, e) \ do { \ + WARN_ON(vcpu_get_flag((v), INCREMENT_PC)); \ vcpu_set_flag((v), PENDING_EXCEPTION); \ vcpu_set_flag((v), e); \ } while (0) From 5a3984f4ec73d1c7cf31a4cee46cca7d4c75deee Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:26 +0100 Subject: [PATCH 18/69] KVM: arm64: Add build-time sanity checks for flags Flags are great, but flags can also be dangerous: it is easy to encode a flag that is bigger than its container (unless the container is a u64), and it is easy to construct a flag value that doesn't fit in the mask that is associated with it. Add a couple of build-time sanity checks that ensure we catch these two cases. Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ffbeb5f5692e..6a37018f40b7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -433,8 +433,20 @@ struct kvm_vcpu_arch { #define __unpack_flag(_set, _f, _m) _f #define unpack_vcpu_flag(...) __unpack_flag(__VA_ARGS__) +#define __build_check_flag(v, flagset, f, m) \ + do { \ + typeof(v->arch.flagset) *_fset; \ + \ + /* Check that the flags fit in the mask */ \ + BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m))); \ + /* Check that the flags fit in the type */ \ + BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m)); \ + } while (0) + #define __vcpu_get_flag(v, flagset, f, m) \ ({ \ + __build_check_flag(v, flagset, f, m); \ + \ v->arch.flagset & (m); \ }) @@ -442,6 +454,8 @@ struct kvm_vcpu_arch { do { \ typeof(v->arch.flagset) *fset; \ \ + __build_check_flag(v, flagset, f, m); \ + \ fset = &v->arch.flagset; \ if (HWEIGHT(m) > 1) \ *fset &= ~(m); \ @@ -452,6 +466,8 @@ struct kvm_vcpu_arch { do { \ typeof(v->arch.flagset) *fset; \ \ + __build_check_flag(v, flagset, f, m); \ + \ fset = &v->arch.flagset; \ *fset &= ~(m); \ } while (0) From 54ddda919c4bc37c113727034619c4e15c184334 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:27 +0100 Subject: [PATCH 19/69] KVM: arm64: Reduce the size of the vcpu flag members Now that we can detect flags overflowing their container, reduce the size of all flag set members in the vcpu struct, turning them into 8bit quantities. Even with the FP state enum occupying 32bit, the whole of the state that was represented by flags is smaller by one byte. Profit! 
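Spelling out the arithmetic behind that claim (ignoring structure padding, and taking the enum at the 32bit size mentioned above):

    before the series:  u64 flags                          = 8 bytes
    after this patch:   enum fp_state (32bit)              = 4 bytes
                        u8 cflags + u8 iflags + u8 sflags  = 3 bytes
                                                     total = 7 bytes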
Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 6a37018f40b7..c6975ecf5a5f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -333,13 +333,13 @@ struct kvm_vcpu_arch { } fp_state; /* Configuration flags, set once and for all before the vcpu can run */ - u64 cflags; + u8 cflags; /* Input flags to the hypervisor code, potentially cleared after use */ - u64 iflags; + u8 iflags; /* State flags for kernel bookkeeping, unused by the hypervisor code */ - u64 sflags; + u8 sflags; /* * We maintain more than a single set of debug registers to support From 0fa4a3137e943cd6acab386ff26cd8d5e94e9559 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 28 May 2022 12:38:28 +0100 Subject: [PATCH 20/69] KVM: arm64: Document why pause cannot be turned into a flag It would be tempting to turn the 'pause' state into a flag. However, this cannot easily be done as it is updated out of context, while all the flags expect to only be updated from the vcpu thread. Turning it into a flag would require to make all flag updates atomic, which isn't necessary desireable. Document this, and take this opportunity to move the field next to the flag sets, filling a hole in the vcpu structure. Reviewed-by: Fuad Tabba Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c6975ecf5a5f..2cc42e1fec18 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -341,6 +341,15 @@ struct kvm_vcpu_arch { /* State flags for kernel bookkeeping, unused by the hypervisor code */ u8 sflags; + /* + * Don't run the guest (internal implementation need). + * + * Contrary to the flags above, this is set/cleared outside of + * a vcpu context, and thus cannot be mixed with the flags + * themselves (or the flag accesses need to be made atomic). + */ + bool pause; + /* * We maintain more than a single set of debug registers to support * debugging the guest from the host and to maintain separate host and @@ -394,9 +403,6 @@ struct kvm_vcpu_arch { /* vcpu power state */ struct kvm_mp_state mp_state; - /* Don't run the guest (internal implementation need) */ - bool pause; - /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; From b4da91879e98bdd5998ee84f47f02426ac50a729 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 8 Jun 2022 14:22:31 +0100 Subject: [PATCH 21/69] KVM: arm64: Move the handling of !FP outside of the fast path We currently start by assuming that the host owns the FP unit at load time, then check again whether this is the case as we are about to run. Only at this point do we account for the fact that there is a (vanishingly small) chance that we're running on a system without a FPSIMD unit (yes, this is madness). We can actually move this FPSIMD check as early as load-time, and drop the check at run time. No intended change in behaviour. 
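As far as I can tell from the two hunks below, the net effect on a system without FPSIMD is simply that the vcpu never leaves FP_STATE_FREE:

    /*
     * Sketch of the resulting flow on a !FPSIMD system (my reading,
     * not something spelled out in the patch):
     *
     *   kvm_arch_vcpu_create():   fp_state = FP_STATE_FREE;
     *   kvm_arch_vcpu_load_fp():  returns early, fp_state untouched;
     *
     * so the guest never owns the FP regs, the traps stay armed, and the
     * per-run ctxflush path no longer needs to care about the capability.
     */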
Suggested-by: Reiji Watanabe Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 6 ++++++ arch/arm64/kvm/fpsimd.c | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 8b9da9d30485..a9dd7ec38f38 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -328,6 +328,12 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + /* + * Default value for the FP state, will be overloaded at load + * time if we support FP (pretty likely) + */ + vcpu->arch.fp_state = FP_STATE_FREE; + /* Set up the timer */ kvm_timer_vcpu_init(vcpu); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 557a96f8e06a..ec8e4494873d 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -77,6 +77,9 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) BUG_ON(!current->mm); BUG_ON(test_thread_flag(TIF_SVE)); + if (!system_supports_fpsimd()) + return; + vcpu->arch.fp_state = FP_STATE_HOST_OWNED; vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); @@ -110,13 +113,10 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) * FP while we were preemptible (such as off the back of an interrupt), * then neither the host nor the guest own the FP hardware (and it was the * responsibility of the code that used FP to save the existing state). - * - * Note that not supporting FP is basically the same thing as far as the - * hypervisor is concerned (nothing to save). */ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu) { - if (!system_supports_fpsimd() || test_thread_flag(TIF_FOREIGN_FPSTATE)) + if (test_thread_flag(TIF_FOREIGN_FPSTATE)) vcpu->arch.fp_state = FP_STATE_FREE; } From 3d5697f95e492899d0bf813cbab2af03dde77fa2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 13 Jun 2022 18:20:25 +0900 Subject: [PATCH 22/69] KVM: arm64: nvhe: Rename confusing obj-y This Makefile appends several objects to obj-y from line 15, but none of them is linked to vmlinux in an ordinary way. obj-y is overwritten at line 30: obj-y := kvm_nvhe.o So, kvm_nvhe.o is the only object directly linked to vmlinux. Replace the abused obj-y with hyp-obj-y. 
Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220613092026.1705630-1-masahiroy@kernel.org --- arch/arm64/kvm/hyp/nvhe/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index f9fe4dc21b1f..3c6d3a18171c 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -12,13 +12,13 @@ HOST_EXTRACFLAGS += -I$(objtree)/include lib-objs := clear_page.o copy_page.o memcpy.o memset.o lib-objs := $(addprefix ../../../lib/, $(lib-objs)) -obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ +hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o -obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ +hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o -obj-$(CONFIG_DEBUG_LIST) += list_debug.o -obj-y += $(lib-objs) +hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o +hyp-obj-y += $(lib-objs) ## ## Build rules for compiling nVHE hyp code @@ -26,7 +26,7 @@ obj-y += $(lib-objs) ## file containing all nVHE hyp code and data. ## -hyp-obj := $(patsubst %.o,%.nvhe.o,$(obj-y)) +hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) obj-y := kvm_nvhe.o extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o From 40c56bd8e1aea7929a09f1d4d68ac3221bb142c4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 13 Jun 2022 18:20:26 +0900 Subject: [PATCH 23/69] KVM: arm64: nvhe: Add intermediates to 'targets' instead of extra-y These are generated on demand. Adding them to 'targets' is enough. Signed-off-by: Masahiro Yamada Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220613092026.1705630-2-masahiroy@kernel.org --- arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 3c6d3a18171c..a2b0d043dddf 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -28,7 +28,7 @@ hyp-obj-y += $(lib-objs) hyp-obj := $(patsubst %.o,%.nvhe.o,$(hyp-obj-y)) obj-y := kvm_nvhe.o -extra-y := $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o +targets += $(hyp-obj) kvm_nvhe.tmp.o kvm_nvhe.rel.o hyp.lds hyp-reloc.S hyp-reloc.o # 1) Compile all source files to `.nvhe.o` object files. The file extension # avoids file name clashes for files shared with VHE. From 1c3ace2b8b3995d3213c5e2d2aca01a0577a3b0f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 5 Jul 2022 14:23:10 +0000 Subject: [PATCH 24/69] KVM: arm64: Don't return from void function Although harmless, the return statement in kvm_unexpected_el2_exception is rather confusing as the function itself has a void return type. The C standard is also pretty clear that "A return statement with an expression shall not appear in a function whose return type is void". Given that this return statement does not seem to add any actual value, let's not pointlessly violate the standard. Build-tested with GCC 10 and CLANG 13 for good measure, the disassembled code is identical with or without the return statement. 
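For readers wondering what the fuss is about, a minimal stand-alone reproduction of the pattern (not the kernel code itself) is:

    void g(void);

    void f(void)
    {
            return g();     /* accepted by GCC/Clang, but the 'return'
                             * carries no value and the standard frowns
                             * on it, exactly as quoted above */
    }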
Fixes: e9ee186bb735 ("KVM: arm64: Add kvm_extable for vaxorcism code") Signed-off-by: Quentin Perret Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220705142310.3847918-1-qperret@google.com --- arch/arm64/kvm/hyp/nvhe/switch.c | 2 +- arch/arm64/kvm/hyp/vhe/switch.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6db801db8f27..925b34b7708d 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -386,5 +386,5 @@ asmlinkage void __noreturn hyp_panic_bad_stack(void) asmlinkage void kvm_unexpected_el2_exception(void) { - return __kvm_unexpected_el2_exception(); + __kvm_unexpected_el2_exception(); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 969f20daf97a..390af1a6a9b4 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -249,5 +249,5 @@ void __noreturn hyp_panic(void) asmlinkage void kvm_unexpected_el2_exception(void) { - return __kvm_unexpected_el2_exception(); + __kvm_unexpected_el2_exception(); } From 6a4f7fcd750497cb2fa870f799e8b23270bec6e3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Jul 2022 16:41:08 +0100 Subject: [PATCH 25/69] KVM: arm64: selftests: Add support for GICv2 on v3 The current vgic_init test wrongly assumes that the host cannot multiple versions of the GIC architecture, while v2 emulation on v3 has almost always been supported (it was supported before the standalone v3 emulation). Tweak the test to support multiple GIC incarnations. Signed-off-by: Marc Zyngier Fixes: 3f4db37e203b ("KVM: arm64: selftests: Make vgic_init gic version agnostic") Reviewed-by: Ricardo Koller Link: https://lore.kernel.org/r/20220714154108.3531213-1-maz@kernel.org --- tools/testing/selftests/kvm/aarch64/vgic_init.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/vgic_init.c b/tools/testing/selftests/kvm/aarch64/vgic_init.c index 34379c98d2f4..21ba4002fc18 100644 --- a/tools/testing/selftests/kvm/aarch64/vgic_init.c +++ b/tools/testing/selftests/kvm/aarch64/vgic_init.c @@ -670,7 +670,7 @@ int test_kvm_device(uint32_t gic_dev_type) if (!_kvm_create_device(v.vm, other, true, &fd)) { ret = _kvm_create_device(v.vm, other, false, &fd); - TEST_ASSERT(ret && errno == EINVAL, + TEST_ASSERT(ret && (errno == EINVAL || errno == EEXIST), "create GIC device while other version exists"); } @@ -698,6 +698,7 @@ int main(int ac, char **av) { int ret; int pa_bits; + int cnt_impl = 0; pa_bits = vm_guest_mode_params[VM_MODE_DEFAULT].pa_bits; max_phys_size = 1ULL << pa_bits; @@ -706,17 +707,19 @@ int main(int ac, char **av) if (!ret) { pr_info("Running GIC_v3 tests.\n"); run_tests(KVM_DEV_TYPE_ARM_VGIC_V3); - return 0; + cnt_impl++; } ret = test_kvm_device(KVM_DEV_TYPE_ARM_VGIC_V2); if (!ret) { pr_info("Running GIC_v2 tests.\n"); run_tests(KVM_DEV_TYPE_ARM_VGIC_V2); - return 0; + cnt_impl++; } - print_skip("No GICv2 nor GICv3 support"); - exit(KSFT_SKIP); + if (!cnt_impl) { + print_skip("No GICv2 nor GICv3 support"); + exit(KSFT_SKIP); + } return 0; } From ed6313a93fd11d2015ad17046f3c418bf6a8dab1 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Fri, 15 Jul 2022 16:58:24 -0700 Subject: [PATCH 26/69] KVM: arm64: Fix hypervisor address symbolization With CONFIG_RANDOMIZE_BASE=y vmlinux addresses will resolve incorrectly from kallsyms. Fix this by adding the KASLR offset before printing the symbols. 
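In other words (my paraphrase of the fix, with illustrative names rather than anything taken from the patch):

    /*
     * kallsyms holds symbols at their randomized run-time addresses, so a
     * link-time vmlinux address must be rebased before %pB can resolve it:
     *
     *   run-time address = link-time address + kaslr_offset()
     */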
Fixes: 6ccf9cb557bd ("KVM: arm64: Symbolize the nVHE HYP addresses") Reported-by: Fuad Tabba Signed-off-by: Kalesh Singh Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220715235824.2549012-1-kaleshsingh@google.com --- arch/arm64/kvm/handle_exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f66c0142b335..e43926ef2bc2 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -347,10 +347,10 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else kvm_err("nVHE hyp BUG at: [<%016llx>] %pB!\n", panic_addr, - (void *)panic_addr); + (void *)(panic_addr + kaslr_offset())); } else { kvm_err("nVHE hyp panic at: [<%016llx>] %pB!\n", panic_addr, - (void *)panic_addr); + (void *)(panic_addr + kaslr_offset())); } /* From da8d120fbafe1d3217d25ac45493538b37cff87c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 3 Jul 2022 14:08:46 +0100 Subject: [PATCH 27/69] KVM: arm64: Add get_reg_by_id() as a sys_reg_desc retrieving helper find_reg_by_id() requires a sys_reg_param as input, which most users provide as a on-stack variable, but don't make any use of the result. Provide a helper that doesn't have this requirement and simplify the callers (all but one). Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 28 +++++++++++++++++----------- arch/arm64/kvm/sys_regs.h | 4 ++++ arch/arm64/kvm/vgic-sys-reg-v3.c | 8 ++------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index c06c0477fab5..1f410283c592 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2650,21 +2650,29 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, return find_reg(params, table, num); } +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num) +{ + struct sys_reg_params params; + + if (!index_to_params(id, ¶ms)) + return NULL; + + return find_reg(¶ms, table, num); +} + /* Decode an index value, and find the sys_reg_desc entry. */ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id) { const struct sys_reg_desc *r; - struct sys_reg_params params; /* We only do sys_reg for now. */ if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; - if (!index_to_params(id, ¶ms)) - return NULL; - - r = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + r = get_reg_by_id(id, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); /* Not saved in the sys_reg array and not otherwise accessible? 
*/ if (r && !(r->reg || r->get_user)) @@ -2723,11 +2731,10 @@ static int reg_to_user(void __user *uaddr, const u64 *val, u64 id) static int get_invariant_sys_reg(u64 id, void __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; - r = find_reg_by_id(id, ¶ms, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; @@ -2736,13 +2743,12 @@ static int get_invariant_sys_reg(u64 id, void __user *uaddr) static int set_invariant_sys_reg(u64 id, void __user *uaddr) { - struct sys_reg_params params; const struct sys_reg_desc *r; int err; u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ - r = find_reg_by_id(id, ¶ms, invariant_sys_regs, - ARRAY_SIZE(invariant_sys_regs)); + r = get_reg_by_id(id, invariant_sys_regs, + ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index aee8ea054f0d..ce30ed9566ae 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -195,6 +195,10 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); +const struct sys_reg_desc *get_reg_by_id(u64 id, + const struct sys_reg_desc table[], + unsigned int num); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 07d5271e9f05..644acda33c7c 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -263,14 +263,10 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, u64 *reg) { - struct sys_reg_params params; u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; - params.regval = *reg; - params.is_write = is_write; - - if (find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs))) + if (get_reg_by_id(sysreg, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs))) return 0; return -ENXIO; From 1deeffb559663dc44e4b8a61fe7e271fe3b4b836 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 3 Jul 2022 15:11:50 +0100 Subject: [PATCH 28/69] KVM: arm64: Reorder handling of invariant sysregs from userspace In order to allow some further refactor of the sysreg helpers, move the handling of invariant sysreg to occur before we handle all the other ones. 
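The resulting lookup order for a userspace access, summarising the two hunks below:

    /*
     * kvm_arm_sys_reg_get_reg()/kvm_arm_sys_reg_set_reg() after this patch:
     *
     *   1. demux (KVM_REG_ARM_DEMUX)   - unchanged, still handled first
     *   2. invariant_sys_regs[]        - now tried next; any result other
     *                                    than -ENOENT is final
     *   3. sys_reg_descs[]             - via index_to_sys_reg_desc()
     */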
Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 1f410283c592..9291cb94c2e4 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2849,6 +2849,7 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg { const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(reg->id, uaddr); @@ -2856,12 +2857,14 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) return -ENOENT; + err = get_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; + r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return get_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->get_user) @@ -2874,6 +2877,7 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg { const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; + int err; if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(reg->id, uaddr); @@ -2881,12 +2885,14 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) return -ENOENT; + err = set_invariant_sys_reg(reg->id, uaddr); + if (err != -ENOENT) + return err; + r = index_to_sys_reg_desc(vcpu, reg->id); - if (!r) - return set_invariant_sys_reg(reg->id, uaddr); /* Check for regs disabled by runtime config */ - if (sysreg_hidden(vcpu, r)) + if (!r || sysreg_hidden(vcpu, r)) return -ENOENT; if (r->set_user) From ba23aec9f4f27c00ac7a504aae60cae8a4087a19 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 3 Jul 2022 16:06:51 +0100 Subject: [PATCH 29/69] KVM: arm64: Introduce generic get_user/set_user helpers for system registers The userspace access to the system registers is done using helpers that hardcode the table that is looked up. extract some generic helpers from this, moving the handling of hidden sysregs into the core code. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 60 +++++++++++++++++++++++++-------------- arch/arm64/kvm/sys_regs.h | 6 ++++ 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9291cb94c2e4..0fbdb21a3600 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2663,8 +2663,10 @@ const struct sys_reg_desc *get_reg_by_id(u64 id, } /* Decode an index value, and find the sys_reg_desc entry. */ -static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, - u64 id) +static const struct sys_reg_desc * +id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, + const struct sys_reg_desc table[], unsigned int num) + { const struct sys_reg_desc *r; @@ -2672,10 +2674,10 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu, if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG) return NULL; - r = get_reg_by_id(id, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + r = get_reg_by_id(id, table, num); /* Not saved in the sys_reg array and not otherwise accessible? 
*/ - if (r && !(r->reg || r->get_user)) + if (r && (!(r->reg || r->get_user) || sysreg_hidden(vcpu, r))) r = NULL; return r; @@ -2845,9 +2847,24 @@ static int demux_c15_set(u64 id, void __user *uaddr) } } +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + void __user *uaddr = (void __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + if (!r) + return -ENOENT; + + if (r->get_user) + return (r->get_user)(vcpu, r, reg, uaddr); + + return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id); +} + int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; int err; @@ -2861,21 +2878,28 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (err != -ENOENT) return err; - r = index_to_sys_reg_desc(vcpu, reg->id); + return kvm_sys_reg_get_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); +} - /* Check for regs disabled by runtime config */ - if (!r || sysreg_hidden(vcpu, r)) +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num) +{ + void __user *uaddr = (void __user *)(unsigned long)reg->addr; + const struct sys_reg_desc *r; + + r = id_to_sys_reg_desc(vcpu, reg->id, table, num); + if (!r) return -ENOENT; - if (r->get_user) - return (r->get_user)(vcpu, r, reg, uaddr); + if (r->set_user) + return (r->set_user)(vcpu, r, reg, uaddr); - return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id); + return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { - const struct sys_reg_desc *r; void __user *uaddr = (void __user *)(unsigned long)reg->addr; int err; @@ -2889,16 +2913,8 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if (err != -ENOENT) return err; - r = index_to_sys_reg_desc(vcpu, reg->id); - - /* Check for regs disabled by runtime config */ - if (!r || sysreg_hidden(vcpu, r)) - return -ENOENT; - - if (r->set_user) - return (r->set_user)(vcpu, r, reg, uaddr); - - return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); + return kvm_sys_reg_set_user(vcpu, reg, + sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); } static unsigned int num_demux_regs(void) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index ce30ed9566ae..4fb6d59e7874 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -199,6 +199,12 @@ const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); +int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); + +int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, + const struct sys_reg_desc table[], unsigned int num); + #define AA32(_x) .aarch32_map = AA32_##_x #define Op0(_x) .Op0 = _x #define Op1(_x) .Op1 = _x From e48407ff9796529a1e5048b9e4d6ea8a0334468e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 17:01:50 +0100 Subject: [PATCH 30/69] KVM: arm64: Rely on index_to_param() for size checks on userspace access index_to_param() already checks that we use 64bit accesses for all registers accessed from userspace. 
However, we have extra checks in other places (such as index_to_params), which is pretty confusing. Get rid off these redundant checks. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 0fbdb21a3600..5dbe0f4b8167 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2871,9 +2871,6 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_get(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; - err = get_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) return err; @@ -2906,9 +2903,6 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) return demux_c15_set(reg->id, uaddr); - if (KVM_REG_SIZE(reg->id) != sizeof(__u64)) - return -ENOENT; - err = set_invariant_sys_reg(reg->id, uaddr); if (err != -ENOENT) return err; From 978ceeb3e40a59973ff1d1c3d23484f71f141819 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 17:27:00 +0100 Subject: [PATCH 31/69] KVM: arm64: Consolidate sysreg userspace accesses Until now, the .set_user and .get_user callbacks have to implement (directly or not) the userspace memory accesses. Although this gives us maximem flexibility, this is also a maintenance burden, making it hard to audit, and I'd feel much better if it was all located in a single place. So let's do just that, simplifying most of the function signatures in the process (the callbacks are now only concerned with the data itself, and not with userspace). Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 160 ++++++++++++++------------------------ arch/arm64/kvm/sys_regs.h | 4 +- 2 files changed, 61 insertions(+), 103 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 5dbe0f4b8167..526798524697 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -321,16 +321,8 @@ static bool trap_oslsr_el1(struct kvm_vcpu *vcpu, } static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - u64 id = sys_reg_to_index(rd); - u64 val; - int err; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - /* * The only modifiable bit is the OSLK bit. Refuse the write if * userspace attempts to change any other bit in the register. 
@@ -451,22 +443,16 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, } static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; return 0; } static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; return 0; } @@ -493,23 +479,16 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, } static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; - + vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; return 0; } static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; return 0; } @@ -537,22 +516,16 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, } static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; return 0; } static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; return 0; } @@ -579,22 +552,16 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, } static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_from_user(r, uaddr, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; return 0; } static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - __u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0) - return -EFAULT; + *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; return 0; } @@ -1227,16 +1194,9 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - const u64 id = sys_reg_to_index(rd); u8 csv2, csv3; - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; /* * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as @@ -1262,7 +1222,7 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu 
*vcpu, return -EINVAL; vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3 ; + vcpu->kvm->arch.pfr0_csv3 = csv3; return 0; } @@ -1275,27 +1235,17 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, * to be changed. */ static int __get_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, + const struct sys_reg_desc *rd, u64 *val, bool raz) { - const u64 id = sys_reg_to_index(rd); - const u64 val = read_id_reg(vcpu, rd, raz); - - return reg_to_user(uaddr, &val, id); + *val = read_id_reg(vcpu, rd, raz); + return 0; } static int __set_id_reg(const struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, void __user *uaddr, + const struct sys_reg_desc *rd, u64 val, bool raz) { - const u64 id = sys_reg_to_index(rd); - int err; - u64 val; - - err = reg_from_user(&val, uaddr, id); - if (err) - return err; - /* This is what we mean by invariant: you can't change it. */ if (val != read_id_reg(vcpu, rd, raz)) return -EINVAL; @@ -1304,47 +1254,37 @@ static int __set_id_reg(const struct kvm_vcpu *vcpu, } static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __get_id_reg(vcpu, rd, uaddr, raz); + return __get_id_reg(vcpu, rd, val, raz); } static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { bool raz = sysreg_visible_as_raz(vcpu, rd); - return __set_id_reg(vcpu, rd, uaddr, raz); + return __set_id_reg(vcpu, rd, val, raz); } static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - return __set_id_reg(vcpu, rd, uaddr, true); + return __set_id_reg(vcpu, rd, val, true); } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 *val) { - const u64 id = sys_reg_to_index(rd); - const u64 val = 0; - - return reg_to_user(uaddr, &val, id); + *val = 0; + return 0; } static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr) + u64 val) { - int err; - u64 val; - - /* Perform the access even if we are going to ignore the value */ - err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); - if (err) - return err; - return 0; } @@ -2850,17 +2790,26 @@ static int demux_c15_set(u64 id, void __user *uaddr) int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num) { - void __user *uaddr = (void __user *)(unsigned long)reg->addr; + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 val; + int ret; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r) return -ENOENT; - if (r->get_user) - return (r->get_user)(vcpu, r, reg, uaddr); + if (r->get_user) { + ret = (r->get_user)(vcpu, r, &val); + } else { + val = __vcpu_sys_reg(vcpu, r->reg); + ret = 0; + } - return reg_to_user(uaddr, &__vcpu_sys_reg(vcpu, r->reg), reg->id); + if (!ret) + ret = put_user(val, uaddr); + + return ret; } int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) @@ -2882,17 +2831,26 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int 
num) { - void __user *uaddr = (void __user *)(unsigned long)reg->addr; + u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr; const struct sys_reg_desc *r; + u64 val; + int ret; + + if (get_user(val, uaddr)) + return -EFAULT; r = id_to_sys_reg_desc(vcpu, reg->id, table, num); if (!r) return -ENOENT; - if (r->set_user) - return (r->set_user)(vcpu, r, reg, uaddr); + if (r->set_user) { + ret = (r->set_user)(vcpu, r, val); + } else { + __vcpu_sys_reg(vcpu, r->reg) = val; + ret = 0; + } - return reg_from_user(&__vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id); + return ret; } int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 4fb6d59e7874..b8b576a2af2b 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -75,9 +75,9 @@ struct sys_reg_desc { /* Custom get/set_user functions, fallback to generic if NULL */ int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 *val); int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - const struct kvm_one_reg *reg, void __user *uaddr); + u64 val); /* Return mask of REG_* runtime visibility overrides */ unsigned int (*visibility)(const struct kvm_vcpu *vcpu, From 5a420ed9646a934e983358aeba1bf3cd993d1cc5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 17:55:43 +0100 Subject: [PATCH 32/69] KVM: arm64: Get rid of reg_from/to_user() These helpers are only used by the invariant stuff now, and while they pretend to support non-64bit registers, this only serves as a way to scare the casual reviewer... Replace these helpers with our good friends get/put_user(), and don't look back. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 526798524697..379478eecfaa 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -44,8 +44,6 @@ * 64bit interface. 
*/ -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id); -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id); static u64 sys_reg_to_index(const struct sys_reg_desc *reg); static bool read_from_write_only(struct kvm_vcpu *vcpu, @@ -2657,21 +2655,7 @@ static struct sys_reg_desc invariant_sys_regs[] = { { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; -static int reg_from_user(u64 *val, const void __user *uaddr, u64 id) -{ - if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int reg_to_user(void __user *uaddr, const u64 *val, u64 id) -{ - if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int get_invariant_sys_reg(u64 id, void __user *uaddr) +static int get_invariant_sys_reg(u64 id, u64 __user *uaddr) { const struct sys_reg_desc *r; @@ -2680,23 +2664,21 @@ static int get_invariant_sys_reg(u64 id, void __user *uaddr) if (!r) return -ENOENT; - return reg_to_user(uaddr, &r->val, id); + return put_user(r->val, uaddr); } -static int set_invariant_sys_reg(u64 id, void __user *uaddr) +static int set_invariant_sys_reg(u64 id, u64 __user *uaddr) { const struct sys_reg_desc *r; - int err; - u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */ + u64 val; r = get_reg_by_id(id, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs)); if (!r) return -ENOENT; - err = reg_from_user(&val, uaddr, id); - if (err) - return err; + if (get_user(val, uaddr)) + return -EFAULT; /* This is what we mean by invariant: you can't change it. */ if (r->val != val) From b61fc0857a3ad4cdee44128ad13685033e237367 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 3 Jul 2022 14:57:29 +0100 Subject: [PATCH 33/69] KVM: arm64: vgic-v3: Simplify vgic_v3_has_cpu_sysregs_attr() Finding out whether a sysreg exists has little to do with that register being accessed, so drop the is_write parameter. Also, the reg pointer is completely unused, and we're better off just passing the attr pointer to the function. This result in a small cleanup of the calling site, with a new helper converting the vGIC view of a sysreg into the canonical one (this is purely cosmetic, as the encoding is the same). 
Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic-sys-reg-v3.c | 14 ++++++++++---- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 8 ++------ arch/arm64/kvm/vgic/vgic.h | 3 +-- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 644acda33c7c..85a5e1d15e9f 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -260,12 +260,18 @@ static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, }; -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +static u64 attr_to_id(u64 attr) { - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + return ARM64_SYS_REG(FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr), + FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr)); +} - if (get_reg_by_id(sysreg, gic_v3_icc_reg_descs, +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + if (get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs, ARRAY_SIZE(gic_v3_icc_reg_descs))) return 0; diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index f15e29cc63ce..a2ff73899976 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -986,12 +986,8 @@ int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr) iodev.base_addr = 0; break; } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 reg, id; - - id = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - return vgic_v3_has_cpu_sysregs_attr(vcpu, 0, id, ®); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_has_cpu_sysregs_attr(vcpu, attr); default: return -ENXIO; } diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 4c6bdd321faa..ffc2d3c81b28 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -247,8 +247,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, u64 *val); -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg); +int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u64 *val); int kvm_register_vgic_device(unsigned long type); From db25081e147c3cc496b8cd8c9d67f992546df6d5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 08:07:44 +0100 Subject: [PATCH 34/69] KVM: arm64: vgic-v3: Push user access into vgic_v3_cpu_sysregs_uaccess() In order to start making the vgic sysreg access from userspace similar to all the other sysregs, push the userspace memory access one level down into vgic_v3_cpu_sysregs_uaccess(). The next step will be to rely on the sysreg infrastructure to perform this task. 
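Schematically, the userspace path for KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS ends up as follows (a summary of the diff below, not new behaviour):

    /*
     *   vgic_v3_get_attr()/vgic_v3_set_attr()
     *     -> vgic_v3_attr_regs_access(dev, attr, NULL, is_write)
     *        -> vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write)
     *
     * get_user()/put_user() now happen in the innermost helper, next to
     * the sys_reg_desc lookup, rather than in the generic attr wrappers.
     */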
Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic-sys-reg-v3.c | 22 ++++++++++++------ arch/arm64/kvm/vgic/vgic-kvm-device.c | 33 ++++++--------------------- arch/arm64/kvm/vgic/vgic.h | 4 ++-- 3 files changed, 24 insertions(+), 35 deletions(-) diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 85a5e1d15e9f..88eb5b049c2c 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -278,15 +278,21 @@ int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr * return -ENXIO; } -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, + bool is_write) { + u64 __user *uaddr = (u64 __user *)(long)attr->addr; struct sys_reg_params params; const struct sys_reg_desc *r; - u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64; + u64 sysreg; - if (is_write) - params.regval = *reg; + sysreg = attr_to_id(attr->attr); + + if (is_write) { + if (get_user(params.regval, uaddr)) + return -EFAULT; + } params.is_write = is_write; r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, @@ -297,8 +303,10 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, if (!r->access(vcpu, ¶ms, r)) return -EINVAL; - if (!is_write) - *reg = params.regval; + if (!is_write) { + if (put_user(params.regval, uaddr)) + return -EFAULT; + } return 0; } diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index c6d52a1fd9c8..bf745c6ab2ea 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -512,7 +512,7 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written + * @reg: address the value is read or written, NULL for sysregs * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, @@ -561,14 +561,9 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, if (!is_write) *reg = tmp32; break; - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 regid; - - regid = (attr->attr & KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK); - ret = vgic_v3_cpu_sysregs_uaccess(vcpu, is_write, - regid, reg); + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); break; - } case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { unsigned int info, intid; @@ -617,15 +612,8 @@ static int vgic_v3_set_attr(struct kvm_device *dev, reg = tmp32; return vgic_v3_attr_regs_access(dev, attr, ®, true); } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return vgic_v3_attr_regs_access(dev, attr, NULL, true); case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u64 reg; @@ -681,15 +669,8 @@ static int vgic_v3_get_attr(struct kvm_device *dev, tmp32 = reg; return put_user(tmp32, uaddr); } - case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 reg; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + return 
vgic_v3_attr_regs_access(dev, attr, NULL, false); case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u64 reg; diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index ffc2d3c81b28..c23118467a35 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -245,8 +245,8 @@ int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, int offset, u32 *val); -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u64 id, u64 *val); +int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, + struct kvm_device_attr *attr, bool is_write); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, u32 intid, u64 *val); From cbcf14dd23bcf228eb6061991acf3721506b97ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 09:57:38 +0100 Subject: [PATCH 35/69] KVM: arm64: vgic-v3: Make the userspace accessors use sysreg API The vgic-v3 sysreg accessors have been ignored as the rest of the sysreg internal API was evolving, and are stuck with the .access method (which is normally reserved to the guest's own access) for the userspace accesses (which should use the .set/.get_user() methods). Catch up with the program and repaint all the accessors so that they fit into the normal userspace model, and plug the result into the helpers that have been introduced earlier. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic-sys-reg-v3.c | 456 +++++++++++++++++-------------- 1 file changed, 258 insertions(+), 198 deletions(-) diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index 88eb5b049c2c..b755b02bc8ba 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -10,254 +10,330 @@ #include "vgic/vgic.h" #include "sys_regs.h" -static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v; struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + + /* + * Disallow restoring VM state if not supported by this + * hardware. + */ + host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> + ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; + if (host_pri_bits > vgic_v3_cpu->num_pri_bits) + return -EINVAL; + + vgic_v3_cpu->num_pri_bits = host_pri_bits; + + host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> + ICC_CTLR_EL1_ID_BITS_SHIFT; + if (host_id_bits > vgic_v3_cpu->num_id_bits) + return -EINVAL; + + vgic_v3_cpu->num_id_bits = host_id_bits; + + host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); + seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> + ICC_CTLR_EL1_SEIS_SHIFT; + if (host_seis != seis) + return -EINVAL; + + host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); + a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; + if (host_a3v != a3v) + return -EINVAL; + + /* + * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. + * The vgic_set_vmcr() will convert to ICH_VMCR layout. 
+ */ + vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; + vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *valp) +{ + struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; + struct vgic_vmcr vmcr; u64 val; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - val = p->regval; + val = 0; + val |= (vgic_v3_cpu->num_pri_bits - 1) << ICC_CTLR_EL1_PRI_BITS_SHIFT; + val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; + val |= ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << + ICC_CTLR_EL1_SEIS_SHIFT; + val |= ((kvm_vgic_global_state.ich_vtr_el2 & + ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << + ICC_CTLR_EL1_A3V_SHIFT; + /* + * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. + * Extract it directly using ICC_CTLR_EL1 reg definitions. + */ + val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; + val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; - /* - * Disallow restoring VM state if not supported by this - * hardware. - */ - host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> - ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; - if (host_pri_bits > vgic_v3_cpu->num_pri_bits) - return false; + *valp = val; - vgic_v3_cpu->num_pri_bits = host_pri_bits; - - host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> - ICC_CTLR_EL1_ID_BITS_SHIFT; - if (host_id_bits > vgic_v3_cpu->num_id_bits) - return false; - - vgic_v3_cpu->num_id_bits = host_id_bits; - - host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); - seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> - ICC_CTLR_EL1_SEIS_SHIFT; - if (host_seis != seis) - return false; - - host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); - a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; - if (host_a3v != a3v) - return false; - - /* - * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. - * The vgic_set_vmcr() will convert to ICH_VMCR layout. - */ - vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; - vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - val = 0; - val |= (vgic_v3_cpu->num_pri_bits - 1) << - ICC_CTLR_EL1_PRI_BITS_SHIFT; - val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << - ICC_CTLR_EL1_SEIS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << - ICC_CTLR_EL1_A3V_SHIFT; - /* - * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. - * Extract it directly using ICC_CTLR_EL1 reg definitions. 
- */ - val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; - val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; - - p->regval = val; - } - - return true; + return 0; } -static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; - } + vmcr.pmr = (val & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); - return true; + return 0; } -static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >> - ICC_BPR0_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & - ICC_BPR0_EL1_MASK; - } + *val = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; - return true; + return 0; } -static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; - if (!p->is_write) - p->regval = 0; + vgic_get_vmcr(vcpu, &vmcr); + vmcr.bpr = (val & ICC_BPR0_EL1_MASK) >> ICC_BPR0_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & ICC_BPR0_EL1_MASK; + + return 0; +} + +static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) { - if (p->is_write) { - vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >> - ICC_BPR1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & - ICC_BPR1_EL1_MASK; - } - } else { - if (!p->is_write) - p->regval = min((vmcr.bpr + 1), 7U); + vmcr.abpr = (val & ICC_BPR1_EL1_MASK) >> ICC_BPR1_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); } - return true; + return 0; } -static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >> - ICC_IGRPEN0_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } else { - p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & - ICC_IGRPEN0_EL1_MASK; - } + if (!vmcr.cbpr) + *val = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & ICC_BPR1_EL1_MASK; + else + *val = min((vmcr.bpr + 1), 7U); - return true; + + return 0; } -static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - if (p->is_write) { - vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >> - ICC_IGRPEN1_EL1_SHIFT; - vgic_set_vmcr(vcpu, &vmcr); - } 
else { - p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & - ICC_IGRPEN1_EL1_MASK; - } + vmcr.grpen0 = (val & ICC_IGRPEN0_EL1_MASK) >> ICC_IGRPEN0_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); - return true; + return 0; } -static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, u8 apr, u8 idx) +static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & ICC_IGRPEN0_EL1_MASK; + + return 0; +} + +static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + vmcr.grpen1 = (val & ICC_IGRPEN1_EL1_MASK) >> ICC_IGRPEN1_EL1_SHIFT; + vgic_set_vmcr(vcpu, &vmcr); + + return 0; +} + +static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + struct vgic_vmcr vmcr; + + vgic_get_vmcr(vcpu, &vmcr); + *val = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & ICC_IGRPEN1_EL1_MASK; + + return 0; +} + +static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx) { struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; - uint32_t *ap_reg; if (apr) - ap_reg = &vgicv3->vgic_ap1r[idx]; + vgicv3->vgic_ap1r[idx] = val; else - ap_reg = &vgicv3->vgic_ap0r[idx]; - - if (p->is_write) - *ap_reg = p->regval; - else - p->regval = *ap_reg; + vgicv3->vgic_ap0r[idx] = val; } -static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r, u8 apr) +static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx) +{ + struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3; + + if (apr) + return vgicv3->vgic_ap1r[idx]; + else + return vgicv3->vgic_ap0r[idx]; +} + +static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) + { u8 idx = r->Op2 & 3; if (idx > vgic_v3_max_apr_idx(vcpu)) - goto err; + return -EINVAL; - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - return true; -err: - if (!p->is_write) - p->regval = 0; - - return false; + set_apr_reg(vcpu, val, 0, idx); + return 0; } -static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) +{ + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 0, idx); + + return 0; +} + +static int set_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) { - return access_gic_aprn(vcpu, p, r, 0); + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + set_apr_reg(vcpu, val, 1, idx); + return 0; } -static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { - return access_gic_aprn(vcpu, p, r, 1); + u8 idx = r->Op2 & 3; + + if (idx > vgic_v3_max_apr_idx(vcpu)) + return -EINVAL; + + *val = get_apr_reg(vcpu, 1, idx); + + return 0; } -static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p, - const struct sys_reg_desc *r) +static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 val) +{ + /* Validate SRE bit */ + if (!(val & ICC_SRE_EL1_SRE)) + return -EINVAL; + + return 0; +} + +static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, + u64 *val) { struct vgic_v3_cpu_if *vgicv3 = 
&vcpu->arch.vgic_cpu.vgic_v3; - /* Validate SRE bit */ - if (p->is_write) { - if (!(p->regval & ICC_SRE_EL1_SRE)) - return false; - } else { - p->regval = vgicv3->vgic_sre; - } + *val = vgicv3->vgic_sre; - return true; + return 0; } + static const struct sys_reg_desc gic_v3_icc_reg_descs[] = { - { SYS_DESC(SYS_ICC_PMR_EL1), access_gic_pmr }, - { SYS_DESC(SYS_ICC_BPR0_EL1), access_gic_bpr0 }, - { SYS_DESC(SYS_ICC_AP0R0_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R1_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R2_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP0R3_EL1), access_gic_ap0r }, - { SYS_DESC(SYS_ICC_AP1R0_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R1_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R2_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_AP1R3_EL1), access_gic_ap1r }, - { SYS_DESC(SYS_ICC_BPR1_EL1), access_gic_bpr1 }, - { SYS_DESC(SYS_ICC_CTLR_EL1), access_gic_ctlr }, - { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, - { SYS_DESC(SYS_ICC_IGRPEN0_EL1), access_gic_grpen0 }, - { SYS_DESC(SYS_ICC_IGRPEN1_EL1), access_gic_grpen1 }, + { SYS_DESC(SYS_ICC_PMR_EL1), + .set_user = set_gic_pmr, .get_user = get_gic_pmr, }, + { SYS_DESC(SYS_ICC_BPR0_EL1), + .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, }, + { SYS_DESC(SYS_ICC_AP0R0_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R1_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R2_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP0R3_EL1), + .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, }, + { SYS_DESC(SYS_ICC_AP1R0_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R1_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R2_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_AP1R3_EL1), + .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, }, + { SYS_DESC(SYS_ICC_BPR1_EL1), + .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, }, + { SYS_DESC(SYS_ICC_CTLR_EL1), + .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, }, + { SYS_DESC(SYS_ICC_SRE_EL1), + .set_user = set_gic_sre, .get_user = get_gic_sre, }, + { SYS_DESC(SYS_ICC_IGRPEN0_EL1), + .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, }, + { SYS_DESC(SYS_ICC_IGRPEN1_EL1), + .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, }, }; static u64 attr_to_id(u64 attr) @@ -282,31 +358,15 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write) { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - struct sys_reg_params params; - const struct sys_reg_desc *r; - u64 sysreg; + struct kvm_one_reg reg = { + .id = attr_to_id(attr->attr), + .addr = attr->addr, + }; - sysreg = attr_to_id(attr->attr); - - if (is_write) { - if (get_user(params.regval, uaddr)) - return -EFAULT; - } - params.is_write = is_write; - - r = find_reg_by_id(sysreg, ¶ms, gic_v3_icc_reg_descs, - ARRAY_SIZE(gic_v3_icc_reg_descs)); - if (!r) - return -ENXIO; - - if (!r->access(vcpu, ¶ms, r)) - return -EINVAL; - - if (!is_write) { - if (put_user(params.regval, uaddr)) - return -EFAULT; - } - - return 0; + if (is_write) + return kvm_sys_reg_set_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); + else + return kvm_sys_reg_get_user(vcpu, ®, gic_v3_icc_reg_descs, + ARRAY_SIZE(gic_v3_icc_reg_descs)); } From 71c3c7753c722b8b10566dcdf1ff0a2eaf33a9c1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 
08:11:54 +0100 Subject: [PATCH 36/69] KVM: arm64: vgic-v3: Convert userspace accessors over to FIELD_GET/FIELD_PREP The GICv3 userspace accessors are all about dealing with conversion between fields from architectural registers and internal representations. However, and owing to the age of this code, the accessors use a combination of shift/mask that is hard to read. It is nonetheless easy to make it better by using the FIELD_{GET,PREP} macros that solely rely on a mask. This results in somewhat nicer looking code, and is probably easier to maintain. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic-sys-reg-v3.c | 60 ++++++++++++++------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c index b755b02bc8ba..9e7c486b48c2 100644 --- a/arch/arm64/kvm/vgic-sys-reg-v3.c +++ b/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -23,30 +23,25 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, * Disallow restoring VM state if not supported by this * hardware. */ - host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >> - ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1; + host_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1; if (host_pri_bits > vgic_v3_cpu->num_pri_bits) return -EINVAL; vgic_v3_cpu->num_pri_bits = host_pri_bits; - host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >> - ICC_CTLR_EL1_ID_BITS_SHIFT; + host_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val); if (host_id_bits > vgic_v3_cpu->num_id_bits) return -EINVAL; vgic_v3_cpu->num_id_bits = host_id_bits; - host_seis = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT); - seis = (val & ICC_CTLR_EL1_SEIS_MASK) >> - ICC_CTLR_EL1_SEIS_SHIFT; + host_seis = FIELD_GET(ICH_VTR_SEIS_MASK, kvm_vgic_global_state.ich_vtr_el2); + seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val); if (host_seis != seis) return -EINVAL; - host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT); - a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT; + host_a3v = FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2); + a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val); if (host_a3v != a3v) return -EINVAL; @@ -54,8 +49,8 @@ static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, * Here set VMCR.CTLR in ICC_CTLR_EL1 layout. * The vgic_set_vmcr() will convert to ICH_VMCR layout. 
*/ - vmcr.cbpr = (val & ICC_CTLR_EL1_CBPR_MASK) >> ICC_CTLR_EL1_CBPR_SHIFT; - vmcr.eoim = (val & ICC_CTLR_EL1_EOImode_MASK) >> ICC_CTLR_EL1_EOImode_SHIFT; + vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val); + vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; @@ -70,20 +65,19 @@ static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_get_vmcr(vcpu, &vmcr); val = 0; - val |= (vgic_v3_cpu->num_pri_bits - 1) << ICC_CTLR_EL1_PRI_BITS_SHIFT; - val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) << - ICC_CTLR_EL1_SEIS_SHIFT; - val |= ((kvm_vgic_global_state.ich_vtr_el2 & - ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) << - ICC_CTLR_EL1_A3V_SHIFT; + val |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, vgic_v3_cpu->num_pri_bits - 1); + val |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, vgic_v3_cpu->num_id_bits); + val |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, + FIELD_GET(ICH_VTR_SEIS_MASK, + kvm_vgic_global_state.ich_vtr_el2)); + val |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, + FIELD_GET(ICH_VTR_A3V_MASK, kvm_vgic_global_state.ich_vtr_el2)); /* * The VMCR.CTLR value is in ICC_CTLR_EL1 layout. * Extract it directly using ICC_CTLR_EL1 reg definitions. */ - val |= (vmcr.cbpr << ICC_CTLR_EL1_CBPR_SHIFT) & ICC_CTLR_EL1_CBPR_MASK; - val |= (vmcr.eoim << ICC_CTLR_EL1_EOImode_SHIFT) & ICC_CTLR_EL1_EOImode_MASK; + val |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr); + val |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim); *valp = val; @@ -96,7 +90,7 @@ static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - vmcr.pmr = (val & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT; + vmcr.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; @@ -108,7 +102,7 @@ static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - *val = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK; + *val = FIELD_PREP(ICC_PMR_EL1_MASK, vmcr.pmr); return 0; } @@ -119,7 +113,7 @@ static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - vmcr.bpr = (val & ICC_BPR0_EL1_MASK) >> ICC_BPR0_EL1_SHIFT; + vmcr.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; @@ -131,7 +125,7 @@ static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - *val = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) & ICC_BPR0_EL1_MASK; + *val = FIELD_PREP(ICC_BPR0_EL1_MASK, vmcr.bpr); return 0; } @@ -143,7 +137,7 @@ static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) { - vmcr.abpr = (val & ICC_BPR1_EL1_MASK) >> ICC_BPR1_EL1_SHIFT; + vmcr.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); } @@ -157,7 +151,7 @@ static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, vgic_get_vmcr(vcpu, &vmcr); if (!vmcr.cbpr) - *val = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) & ICC_BPR1_EL1_MASK; + *val = FIELD_PREP(ICC_BPR1_EL1_MASK, vmcr.abpr); else *val = min((vmcr.bpr + 1), 7U); @@ -171,7 +165,7 @@ static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - vmcr.grpen0 = (val & ICC_IGRPEN0_EL1_MASK) >> ICC_IGRPEN0_EL1_SHIFT; + vmcr.grpen0 = 
FIELD_GET(ICC_IGRPEN0_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; @@ -183,7 +177,7 @@ static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - *val = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) & ICC_IGRPEN0_EL1_MASK; + *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, vmcr.grpen0); return 0; } @@ -194,7 +188,7 @@ static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - vmcr.grpen1 = (val & ICC_IGRPEN1_EL1_MASK) >> ICC_IGRPEN1_EL1_SHIFT; + vmcr.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val); vgic_set_vmcr(vcpu, &vmcr); return 0; @@ -206,7 +200,7 @@ static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, struct vgic_vmcr vmcr; vgic_get_vmcr(vcpu, &vmcr); - *val = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) & ICC_IGRPEN1_EL1_MASK; + *val = FIELD_GET(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1); return 0; } From 38cf0bb7625a58625efeef9ec944671464ff7430 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 10:16:44 +0100 Subject: [PATCH 37/69] KVM: arm64: vgic-v3: Use u32 to manage the line level from userspace Despite the userspace ABI clearly defining the bits dealt with by KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO as a __u32, the kernel uses a u64. Use a u32 to match the userspace ABI, which will subsequently lead to some simplifications. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 6 +++++- arch/arm64/kvm/vgic/vgic-mmio-v3.c | 2 +- arch/arm64/kvm/vgic/vgic-mmio.c | 6 +++--- arch/arm64/kvm/vgic/vgic-mmio.h | 4 ++-- arch/arm64/kvm/vgic/vgic.h | 2 +- 5 files changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index bf745c6ab2ea..f02294b9aef1 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -570,10 +570,14 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { + if (is_write) + tmp32 = *reg; intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, reg); + intid, &tmp32); + if (!is_write) + *reg = tmp32; } else { ret = -EINVAL; } diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v3.c b/arch/arm64/kvm/vgic/vgic-mmio-v3.c index a2ff73899976..91201f743033 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v3.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v3.c @@ -1154,7 +1154,7 @@ int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write, } int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val) + u32 intid, u32 *val) { if (intid % 32) return -EINVAL; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 997d0fce2088..b32d434c1d4a 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -775,10 +775,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, } } -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) { int i; - u64 val = 0; + u32 val = 0; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; for (i = 0; i < 32; i++) { @@ -798,7 +798,7 @@ u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid) } void vgic_write_irq_line_level_info(struct kvm_vcpu 
*vcpu, u32 intid, - const u64 val) + const u32 val) { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h index 6082d4b66d39..5b490a4dfa5e 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.h +++ b/arch/arm64/kvm/vgic/vgic-mmio.h @@ -207,10 +207,10 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev, bool is_write, int offset, u32 *val); -u64 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); +u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid); void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, - const u64 val); + const u32 val); unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev); diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index c23118467a35..0c8da72953f0 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -249,7 +249,7 @@ int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr, bool is_write); int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write, - u32 intid, u64 *val); + u32 intid, u32 *val); int kvm_register_vgic_device(unsigned long type); void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr); From e1246f3f2df7aec025fd587ac3d7912007d1144d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 10:26:07 +0100 Subject: [PATCH 38/69] KVM: arm64: vgic-v3: Consolidate userspace access for MMIO registers For userspace accesses to GICv3 MMIO registers (and related data), vgic_v3_{get,set}_attr are littered with {get,put}_user() calls, making it hard to audit and reason about. Consolidate all userspace accesses in vgic_v3_attr_regs_access(), making the code far simpler to audit. 
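As an illustration, the consolidated helper ends up with roughly the following shape (simplified sketch: the kvm->lock handling, the vGIC initialisation check and the sysreg special case are omitted, and the dispatch on attr->group is elided):

	static int vgic_v3_attr_regs_access(struct kvm_device *dev,
					    struct kvm_device_attr *attr,
					    bool is_write)
	{
		u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr;
		u32 val;
		int ret = 0;

		/* single copy-in for all the MMIO-style groups */
		if (is_write && get_user(val, uaddr))
			return -EFAULT;

		/* ... dispatch on attr->group, operating on &val ... */

		/* single copy-out on the read path */
		if (!ret && !is_write)
			ret = put_user(val, uaddr);

		return ret;
	}
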
Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 103 +++++++++----------------- 1 file changed, 37 insertions(+), 66 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index f02294b9aef1..e9db6795fb90 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -512,18 +512,18 @@ int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr, * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written, NULL for sysregs * @is_write: true if userspace is writing a register */ static int vgic_v3_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u64 *reg, bool is_write) + bool is_write) { struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; + bool uaccess; + u32 val; int ret; - u32 tmp32; ret = vgic_v3_parse_attr(dev, attr, ®_attr); if (ret) @@ -532,6 +532,21 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + switch (attr->group) { + case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: + /* Sysregs uaccess is performed by the sysreg handling code */ + uaccess = false; + break; + default: + uaccess = true; + } + + if (uaccess && is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + if (get_user(val, uaddr)) + return -EFAULT; + } + mutex_lock(&dev->kvm->lock); if (unlikely(!vgic_initialized(dev->kvm))) { @@ -546,20 +561,10 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - if (is_write) - tmp32 = *reg; - - ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &tmp32); - if (!is_write) - *reg = tmp32; + ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write); @@ -570,14 +575,10 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >> KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT; if (info == VGIC_LEVEL_INFO_LINE_LEVEL) { - if (is_write) - tmp32 = *reg; intid = attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK; ret = vgic_v3_line_level_info_uaccess(vcpu, is_write, - intid, &tmp32); - if (!is_write) - *reg = tmp32; + intid, &val); } else { ret = -EINVAL; } @@ -591,6 +592,12 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && uaccess && !is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; + ret = put_user(val, uaddr); + } + return ret; } @@ -605,30 +612,12 @@ static int vgic_v3_set_attr(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 tmp32; - u64 reg; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_attr_regs_access(dev, attr, NULL, true); - case 
KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - if (get_user(tmp32, uaddr)) - return -EFAULT; - - reg = tmp32; - return vgic_v3_attr_regs_access(dev, attr, ®, true); - } + return vgic_v3_attr_regs_access(dev, attr, true); + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_CTRL: { int ret; @@ -662,30 +651,12 @@ static int vgic_v3_get_attr(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } + case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: + return vgic_v3_attr_regs_access(dev, attr, false); case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_attr_regs_access(dev, attr, NULL, false); - case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u64 reg; - u32 tmp32; - - ret = vgic_v3_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - tmp32 = reg; - return put_user(tmp32, uaddr); - } + return vgic_v3_attr_regs_access(dev, attr, false); + case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: + return vgic_v3_attr_regs_access(dev, attr, false); } return -ENXIO; } From 7e9f723c2a90e41407d5889700169be4797a2009 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 10:26:07 +0100 Subject: [PATCH 39/69] KVM: arm64: vgic-v2: Consolidate userspace access for MMIO registers Align the GICv2 MMIO accesses from userspace with the way the GICv3 code is now structured. Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 39 ++++++++++++--------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index e9db6795fb90..066b95d606fd 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -348,17 +348,18 @@ bool lock_all_vcpus(struct kvm *kvm) * * @dev: kvm device handle * @attr: kvm device attribute - * @reg: address the value is read or written * @is_write: true if userspace is writing a register */ static int vgic_v2_attr_regs_access(struct kvm_device *dev, struct kvm_device_attr *attr, - u32 *reg, bool is_write) + bool is_write) { + u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr; struct vgic_reg_attr reg_attr; gpa_t addr; struct kvm_vcpu *vcpu; int ret; + u32 val; ret = vgic_v2_parse_attr(dev, attr, ®_attr); if (ret) @@ -367,6 +368,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, vcpu = reg_attr.vcpu; addr = reg_attr.addr; + if (is_write) + if (get_user(val, uaddr)) + return -EFAULT; + mutex_lock(&dev->kvm->lock); ret = vgic_init(dev->kvm); @@ -380,10 +385,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: - ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val); break; case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, reg); + ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val); break; default: ret = -EINVAL; @@ -393,6 +398,10 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, unlock_all_vcpus(dev->kvm); out: mutex_unlock(&dev->kvm->lock); + + if (!ret && 
!is_write) + ret = put_user(val, uaddr); + return ret; } @@ -407,15 +416,8 @@ static int vgic_v2_set_attr(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg; - - if (get_user(reg, uaddr)) - return -EFAULT; - - return vgic_v2_attr_regs_access(dev, attr, ®, true); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, true); } return -ENXIO; @@ -432,15 +434,8 @@ static int vgic_v2_get_attr(struct kvm_device *dev, switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: - case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: { - u32 __user *uaddr = (u32 __user *)(long)attr->addr; - u32 reg = 0; - - ret = vgic_v2_attr_regs_access(dev, attr, ®, false); - if (ret) - return ret; - return put_user(reg, uaddr); - } + case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: + return vgic_v2_attr_regs_access(dev, attr, false); } return -ENXIO; From d7df6f282db67677c06456fd29d47eda0ba060b9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 11:27:37 +0100 Subject: [PATCH 40/69] KVM: arm64: vgic: Use {get,put}_user() instead of copy_{from.to}_user Tidy-up vgic_get_common_attr() and vgic_set_common_attr() to use {get,put}_user() instead of the more complex (and less type-safe) copy_{from,to}_user(). Reviewed-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 066b95d606fd..c17e5502c0b3 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -170,7 +170,7 @@ static int vgic_set_common_attr(struct kvm_device *dev, u64 addr; unsigned long type = (unsigned long)attr->attr; - if (copy_from_user(&addr, uaddr, sizeof(addr))) + if (get_user(addr, uaddr)) return -EFAULT; r = kvm_vgic_addr(dev->kvm, type, &addr, true); @@ -233,14 +233,14 @@ static int vgic_get_common_attr(struct kvm_device *dev, u64 addr; unsigned long type = (unsigned long)attr->attr; - if (copy_from_user(&addr, uaddr, sizeof(addr))) + if (get_user(addr, uaddr)) return -EFAULT; r = kvm_vgic_addr(dev->kvm, type, &addr, false); if (r) return (r == -ENODEV) ? -ENXIO : r; - if (copy_to_user(uaddr, &addr, sizeof(addr))) + if (put_user(addr, uaddr)) return -EFAULT; break; } From 9f968c9266aa30b0e81be0c6a560e45b93bed3dc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 14:34:33 +0100 Subject: [PATCH 41/69] KVM: arm64: vgic-v2: Add helper for legacy dist/cpuif base address setting We carry a legacy interface to set the base addresses for GICv2. As this is currently plumbed into the same handling code as the modern interface, it limits the evolution we can make there. Add a helper dedicated to this handling, with a view of maybe removing this in the future. 
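For context, the legacy interface mentioned above is the KVM_ARM_SET_DEVICE_ADDR vm ioctl; the modern equivalent goes through KVM_SET_DEVICE_ATTR with the KVM_DEV_ARM_VGIC_GRP_ADDR group on the vGIC device fd. A rough userspace sketch of the legacy path (vm_fd and the 0x08000000 base address are made-up example values):

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	struct kvm_arm_device_addr dev_addr = {
		.id   = ((__u64)KVM_ARM_DEVICE_VGIC_V2 << KVM_ARM_DEVICE_ID_SHIFT) |
			((__u64)KVM_VGIC_V2_ADDR_TYPE_DIST << KVM_ARM_DEVICE_TYPE_SHIFT),
		.addr = 0x08000000,	/* example guest physical base */
	};

	if (ioctl(vm_fd, KVM_ARM_SET_DEVICE_ADDR, &dev_addr) < 0)
		perror("KVM_ARM_SET_DEVICE_ADDR");
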
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 11 ++------- arch/arm64/kvm/vgic/vgic-kvm-device.c | 32 +++++++++++++++++++++++++++ include/kvm/arm_vgic.h | 1 + 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0188144a122..fd26beacbbe5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1414,18 +1414,11 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) { - unsigned long dev_id, type; - - dev_id = (dev_addr->id & KVM_ARM_DEVICE_ID_MASK) >> - KVM_ARM_DEVICE_ID_SHIFT; - type = (dev_addr->id & KVM_ARM_DEVICE_TYPE_MASK) >> - KVM_ARM_DEVICE_TYPE_SHIFT; - - switch (dev_id) { + switch (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id)) { case KVM_ARM_DEVICE_VGIC_V2: if (!vgic_present) return -ENXIO; - return kvm_vgic_addr(kvm, type, &dev_addr->addr, true); + return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr); default: return -ENODEV; } diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index c17e5502c0b3..04175fd55da6 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -41,6 +41,38 @@ static int vgic_check_type(struct kvm *kvm, int type_needed) return 0; } +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr) +{ + struct vgic_dist *vgic = &kvm->arch.vgic; + int r; + + mutex_lock(&kvm->lock); + switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) { + case KVM_VGIC_V2_ADDR_TYPE_DIST: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_dist_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_DIST_SIZE); + if (!r) + vgic->vgic_dist_base = dev_addr->addr; + break; + case KVM_VGIC_V2_ADDR_TYPE_CPU: + r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); + if (!r) + r = vgic_check_iorange(kvm, vgic->vgic_cpu_base, dev_addr->addr, + SZ_4K, KVM_VGIC_V2_CPU_SIZE); + if (!r) + vgic->vgic_cpu_base = dev_addr->addr; + break; + default: + r = -ENODEV; + } + + mutex_unlock(&kvm->lock); + + return r; +} + /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 2d8f2e90edc2..f79cce67563e 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -365,6 +365,7 @@ extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); +int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); From 4b85080f4e378f617f88964dec94fd282bcf2af4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 5 Jul 2022 14:39:24 +0100 Subject: [PATCH 42/69] KVM: arm64: vgic: Consolidate userspace access for base address setting Align kvm_vgic_addr() with the rest of the code by moving the userspace accesses into it. kvm_vgic_addr() is also made static. 
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 75 +++++++++++---------------- include/kvm/arm_vgic.h | 1 - 2 files changed, 31 insertions(+), 45 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 04175fd55da6..011171dc41c5 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -76,8 +76,7 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev /** * kvm_vgic_addr - set or get vgic VM base addresses * @kvm: pointer to the vm struct - * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX - * @addr: pointer to address value + * @attr: pointer to the attribute being retrieved/updated * @write: if true set the address in the VM address space, if false read the * address * @@ -89,15 +88,22 @@ int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev * overlapping regions in case of a virtual GICv3 here, since we don't know * the number of VCPUs yet, so we defer this check to map_resources(). */ -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) +static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write) { - int r = 0; + u64 __user *uaddr = (u64 __user *)attr->addr; struct vgic_dist *vgic = &kvm->arch.vgic; phys_addr_t *addr_ptr, alignment, size; u64 undef_value = VGIC_ADDR_UNDEF; + u64 addr; + int r; + + /* Reading a redistributor region addr implies getting the index */ + if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION) + if (get_user(addr, uaddr)) + return -EFAULT; mutex_lock(&kvm->lock); - switch (type) { + switch (attr->attr) { case KVM_VGIC_V2_ADDR_TYPE_DIST: r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2); addr_ptr = &vgic->vgic_dist_base; @@ -123,7 +129,7 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; if (write) { - r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); + r = vgic_v3_set_redist_base(kvm, 0, addr, 0); goto out; } rdreg = list_first_entry_or_null(&vgic->rd_regions, @@ -143,14 +149,12 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) if (r) break; - index = *addr & KVM_VGIC_V3_RDIST_INDEX_MASK; + index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK; if (write) { - gpa_t base = *addr & KVM_VGIC_V3_RDIST_BASE_MASK; - u32 count = (*addr & KVM_VGIC_V3_RDIST_COUNT_MASK) - >> KVM_VGIC_V3_RDIST_COUNT_SHIFT; - u8 flags = (*addr & KVM_VGIC_V3_RDIST_FLAGS_MASK) - >> KVM_VGIC_V3_RDIST_FLAGS_SHIFT; + gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK; + u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr); + u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr); if (!count || flags) r = -EINVAL; @@ -166,9 +170,9 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; } - *addr = index; - *addr |= rdreg->base; - *addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; + addr = index; + addr |= rdreg->base; + addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT; goto out; } default: @@ -179,15 +183,19 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write) goto out; if (write) { - r = vgic_check_iorange(kvm, *addr_ptr, *addr, alignment, size); + r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size); if (!r) - *addr_ptr = *addr; + *addr_ptr = addr; } else { - *addr = *addr_ptr; + addr = *addr_ptr; } out: mutex_unlock(&kvm->lock); + + if (!r && !write) + r = put_user(addr, uaddr); + return r; 
} @@ -197,17 +205,9 @@ static int vgic_set_common_attr(struct kvm_device *dev, int r; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (get_user(addr, uaddr)) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, true); + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, true); return (r == -ENODEV) ? -ENXIO : r; - } case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; u32 val; @@ -260,22 +260,9 @@ static int vgic_get_common_attr(struct kvm_device *dev, int r = -ENXIO; switch (attr->group) { - case KVM_DEV_ARM_VGIC_GRP_ADDR: { - u64 __user *uaddr = (u64 __user *)(long)attr->addr; - u64 addr; - unsigned long type = (unsigned long)attr->attr; - - if (get_user(addr, uaddr)) - return -EFAULT; - - r = kvm_vgic_addr(dev->kvm, type, &addr, false); - if (r) - return (r == -ENODEV) ? -ENXIO : r; - - if (put_user(addr, uaddr)) - return -EFAULT; - break; - } + case KVM_DEV_ARM_VGIC_GRP_ADDR: + r = kvm_vgic_addr(dev->kvm, attr, false); + return (r == -ENODEV) ? -ENXIO : r; case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: { u32 __user *uaddr = (u32 __user *)(long)attr->addr; diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index f79cce67563e..4df9e73a8bb5 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -364,7 +364,6 @@ struct vgic_cpu { extern struct static_key_false vgic_v2_cpuif_trap; extern struct static_key_false vgic_v3_cpuif_trap; -int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr); void kvm_vgic_early_init(struct kvm *kvm); int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); From 619064afa9b6f0088b86a1fed20c049cfe94cdf7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 14 Jul 2022 08:10:09 +0100 Subject: [PATCH 43/69] KVM: arm64: vgic: Tidy-up calls to vgic_{get,set}_common_attr() The userspace accessors have an early call to vgic_{get,set}_common_attr() that makes the code hard to follow. Move it to the default: clause of the decoding switch statement, which results in a nice cleanup. This requires us to move the handling of the pending table into the common handling, even if it is strictly a GICv3 feature (it has the benefit of keeping the whole control group handling in the same function). Also cleanup vgic_v3_{get,set}_attr() while we're at it, deduplicating the calls to vgic_v3_attr_regs_access(). Suggested-by: Reiji Watanabe Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-kvm-device.c | 78 +++++++++------------------ 1 file changed, 26 insertions(+), 52 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 011171dc41c5..edeac2380591 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -246,6 +246,24 @@ static int vgic_set_common_attr(struct kvm_device *dev, r = vgic_init(dev->kvm); mutex_unlock(&dev->kvm->lock); return r; + case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: + /* + * OK, this one isn't common at all, but we + * want to handle all control group attributes + * in a single place. 
+ */ + if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3)) + return -ENXIO; + mutex_lock(&dev->kvm->lock); + + if (!lock_all_vcpus(dev->kvm)) { + mutex_unlock(&dev->kvm->lock); + return -EBUSY; + } + r = vgic_v3_save_pending_tables(dev->kvm); + unlock_all_vcpus(dev->kvm); + mutex_unlock(&dev->kvm->lock); + return r; } break; } @@ -427,37 +445,25 @@ static int vgic_v2_attr_regs_access(struct kvm_device *dev, static int vgic_v2_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, true); + default: + return vgic_set_common_attr(dev, attr); } - - return -ENXIO; } static int vgic_v2_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: return vgic_v2_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - - return -ENXIO; } static int vgic_v2_has_attr(struct kvm_device *dev, @@ -618,61 +624,29 @@ static int vgic_v3_attr_regs_access(struct kvm_device *dev, static int vgic_v3_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_set_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_attr_regs_access(dev, attr, true); case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, true); - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - int ret; - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES: - mutex_lock(&dev->kvm->lock); - - if (!lock_all_vcpus(dev->kvm)) { - mutex_unlock(&dev->kvm->lock); - return -EBUSY; - } - ret = vgic_v3_save_pending_tables(dev->kvm); - unlock_all_vcpus(dev->kvm); - mutex_unlock(&dev->kvm->lock); - return ret; - } - break; + default: + return vgic_set_common_attr(dev, attr); } - } - return -ENXIO; } static int vgic_v3_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { - int ret; - - ret = vgic_get_common_attr(dev, attr); - if (ret != -ENXIO) - return ret; - switch (attr->group) { case KVM_DEV_ARM_VGIC_GRP_DIST_REGS: case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS: - return vgic_v3_attr_regs_access(dev, attr, false); case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS: - return vgic_v3_attr_regs_access(dev, attr, false); case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: return vgic_v3_attr_regs_access(dev, attr, false); + default: + return vgic_get_common_attr(dev, attr); } - return -ENXIO; } static int vgic_v3_has_attr(struct kvm_device *dev, From f6dddbb25572218d2e8ab93bfdad20cddeb99b5a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 10:03:33 +0100 Subject: [PATCH 44/69] KVM: arm64: Get rid of find_reg_by_id() This helper doesn't have a user anymore, let's get rid of it. 
Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 11 ----------- arch/arm64/kvm/sys_regs.h | 5 ----- 2 files changed, 16 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 379478eecfaa..7ab67a7fc0d8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2577,17 +2577,6 @@ static bool index_to_params(u64 id, struct sys_reg_params *params) } } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) -{ - if (!index_to_params(id, params)) - return NULL; - - return find_reg(params, table, num); -} - const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index b8b576a2af2b..49517f58deb5 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -190,11 +190,6 @@ find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } -const struct sys_reg_desc *find_reg_by_id(u64 id, - struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num); - const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); From c5332898dc35bbed7d3aa02b491e3388315ee481 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 18:25:41 +0100 Subject: [PATCH 45/69] KVM: arm64: Descope kvm_arm_sys_reg_{get,set}_reg() Having kvm_arm_sys_reg_get_reg and co in kvm_host.h gives the impression that these functions are free to be called from anywhere. Not quite. They really are tied to our internal sysreg handling, and they would be better off in the sys_regs.h header, which is private. kvm_host.h could also get a bit of a diet, so let's just do that.
Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/kvm/sys_regs.h | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index de32152cea04..0c9c85981a8e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -620,8 +620,6 @@ int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu); int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 49517f58deb5..a8c4cc32eb9a 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -194,9 +194,10 @@ const struct sys_reg_desc *get_reg_by_id(u64 id, const struct sys_reg_desc table[], unsigned int num); +int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); +int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); - int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, const struct sys_reg_desc table[], unsigned int num); From 4274d42716d87d5301fdf67eb799e7db08fe73de Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 4 Jul 2022 18:11:04 +0100 Subject: [PATCH 46/69] KVM: arm64: Get rid of outdated comments Once upon a time, the 32bit KVM/arm port was the reference, while the arm64 version was the new kid on the block, without a clear future... This was a long time ago. "The times, they are a-changing." Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7ab67a7fc0d8..b4fda04413f2 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -34,11 +34,6 @@ #include "trace.h" /* - * All of this file is extremely similar to the ARM coproc.c, but the - * types are different. My gut feeling is that it should be pretty - * easy to merge, but that would be an ABI breakage -- again. VFP - * would also need to be abstracted. - * * For AArch32, we only take care of what is being trapped. Anything * that has to do with init and userspace access has to go via the * 64bit interface. From 6bf212c89c48458d8deef1c973678c62528dab04 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:34 -0700 Subject: [PATCH 47/69] arm64: stacktrace: Add shared header for common stack unwinding code In order to reuse the arm64 stack unwinding logic for the nVHE hypervisor stack, move the common code to a shared header (arch/arm64/include/asm/stacktrace/common.h). The nVHE hypervisor cannot safely link against kernel code, so we make use of the shared header to avoid duplicated logic later in this series.
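To illustrate the intended reuse, a hypervisor-side unwinder can later initialise its state purely from the shared header, along these lines (hypothetical sketch; hyp_unwind_start, the NULL task and the fp/pc wiring are illustrations only, not part of this patch):

	#include <asm/stacktrace/common.h>

	static void hyp_unwind_start(struct unwind_state *state,
				     unsigned long fp, unsigned long pc)
	{
		/* there is no task_struct to hand at EL2 */
		unwind_init_common(state, NULL);
		state->fp = fp;
		state->pc = pc;
	}
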
Signed-off-by: Kalesh Singh Reviewed-by: Mark Brown Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-2-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace.h | 35 +------ arch/arm64/include/asm/stacktrace/common.h | 105 +++++++++++++++++++++ arch/arm64/kernel/stacktrace.c | 57 ----------- 3 files changed, 106 insertions(+), 91 deletions(-) create mode 100644 arch/arm64/include/asm/stacktrace/common.h diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index aec9315bf156..79f455b37c84 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -8,52 +8,19 @@ #include #include #include -#include #include #include #include #include -enum stack_type { - STACK_TYPE_UNKNOWN, - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_OVERFLOW, - STACK_TYPE_SDEI_NORMAL, - STACK_TYPE_SDEI_CRITICAL, - __NR_STACK_TYPES -}; - -struct stack_info { - unsigned long low; - unsigned long high; - enum stack_type type; -}; +#include extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long size, - unsigned long low, unsigned long high, - enum stack_type type, struct stack_info *info) -{ - if (!low) - return false; - - if (sp < low || sp + size < sp || sp + size > high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = type; - } - return true; -} - static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h new file mode 100644 index 000000000000..64ae4f6b06fe --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common arm64 stack unwinder code. + * + * Copyright (C) 2012 ARM Ltd. + */ +#ifndef __ASM_STACKTRACE_COMMON_H +#define __ASM_STACKTRACE_COMMON_H + +#include +#include +#include + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_OVERFLOW, + STACK_TYPE_SDEI_NORMAL, + STACK_TYPE_SDEI_CRITICAL, + __NR_STACK_TYPES +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) + * + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0. This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + * + * @task: The task being unwound. 
+ */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + struct task_struct *task; +}; + +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) +{ + if (!low) + return false; + + if (sp < low || sp + size < sp || sp + size > high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = type; + } + return true; +} + +static inline void unwind_init_common(struct unwind_state *state, + struct task_struct *task) +{ + state->task = task; +#ifdef CONFIG_KRETPROBES + state->kr_cur = NULL; +#endif + + /* + * Prime the first unwind. + * + * In unwind_next() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; +} + +#endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index fcaa151b81f1..94a5dd2ab8fd 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -18,63 +18,6 @@ #include #include -/* - * A snapshot of a frame record or fp/lr register values, along with some - * accounting information necessary for robust unwinding. - * - * @fp: The fp value in the frame record (or the real fp) - * @pc: The lr value in the frame record (or the real lr) - * - * @stacks_done: Stacks which have been entirely unwound, for which it is no - * longer valid to unwind to. - * - * @prev_fp: The fp that pointed to this frame record, or a synthetic value - * of 0. This is used to ensure that within a stack, each - * subsequent frame record is at an increasing address. - * @prev_type: The type of stack this frame record was on, or a synthetic - * value of STACK_TYPE_UNKNOWN. This is used to detect a - * transition from one stack to another. - * - * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance - * associated with the most recently encountered replacement lr - * value. - * - * @task: The task being unwound. - */ -struct unwind_state { - unsigned long fp; - unsigned long pc; - DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); - unsigned long prev_fp; - enum stack_type prev_type; -#ifdef CONFIG_KRETPROBES - struct llist_node *kr_cur; -#endif - struct task_struct *task; -}; - -static void unwind_init_common(struct unwind_state *state, - struct task_struct *task) -{ - state->task = task; -#ifdef CONFIG_KRETPROBES - state->kr_cur = NULL; -#endif - - /* - * Prime the first unwind. - * - * In unwind_next() we'll check that the FP points to a valid stack, - * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be - * treated as a transition to whichever stack that happens to be. The - * prev_fp value won't be used, but we set it to 0 such that it is - * definitely not an accessible stack address. - */ - bitmap_zero(state->stacks_done, __NR_STACK_TYPES); - state->prev_fp = 0; - state->prev_type = STACK_TYPE_UNKNOWN; -} - /* * Start an unwind from a pt_regs. 
* From 15a59f19a015185bff90a68f601caec151dea4b4 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:35 -0700 Subject: [PATCH 48/69] arm64: stacktrace: Factor out on_accessible_stack_common() Move common on_accessible_stack checks to stacktrace/common.h. This is used in the implementation of the nVHE hypervisor unwinder later in this series. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Reviewed-by: Mark Brown Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-3-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace.h | 6 ++---- arch/arm64/include/asm/stacktrace/common.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 79f455b37c84..43f4b4a6d383 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -65,8 +65,8 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, unsigned long sp, unsigned long size, struct stack_info *info) { - if (info) - info->type = STACK_TYPE_UNKNOWN; + if (on_accessible_stack_common(tsk, sp, size, info)) + return true; if (on_task_stack(tsk, sp, size, info)) return true; @@ -74,8 +74,6 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, return false; if (on_irq_stack(sp, size, info)) return true; - if (on_overflow_stack(sp, size, info)) - return true; if (on_sdei_stack(sp, size, info)) return true; diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 64ae4f6b06fe..f58b786460d3 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -62,6 +62,9 @@ struct unwind_state { struct task_struct *task; }; +static inline bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info); + static inline bool on_stack(unsigned long sp, unsigned long size, unsigned long low, unsigned long high, enum stack_type type, struct stack_info *info) @@ -80,6 +83,21 @@ static inline bool on_stack(unsigned long sp, unsigned long size, return true; } +static inline bool on_accessible_stack_common(const struct task_struct *tsk, + unsigned long sp, + unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + /* + * Both the kernel and nvhe hypervisor make use of + * an overflow_stack + */ + return on_overflow_stack(sp, size, info); +} + static inline void unwind_init_common(struct unwind_state *state, struct task_struct *task) { From be63c647fd28d25484257f5f36a008db7d99991d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:36 -0700 Subject: [PATCH 49/69] arm64: stacktrace: Factor out unwind_next_common() Move common unwind_next logic to stacktrace/common.h. This allows reusing the code in the implementation the nVHE hypervisor stack unwinder, later in this series. 
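As a side note (not part of this patch), a new unwinder built on the factored-out
helper only has to add its own termination policy; the stack checks and the fp/pc
update come from the common code. A minimal, hypothetical sketch against the
helper's signature at this point in the series (the names below are made up):

  #include <asm/stacktrace/common.h>

  /* Hypothetical consumer of unwind_next_common(). */
  static int my_unwind_next(struct unwind_state *state)
  {
          struct stack_info info;

          /* Stop cleanly once the frame chain ends. */
          if (!state->fp)
                  return -ENOENT;

          /* Validation and the fp/pc update are shared code. */
          return unwind_next_common(state, &info);
  }
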
Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Reviewed-by: Mark Brown Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-4-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/common.h | 50 ++++++++++++++++++++++ arch/arm64/kernel/stacktrace.c | 41 ++---------------- 2 files changed, 54 insertions(+), 37 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index f58b786460d3..0c5cbfdb56b5 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -65,6 +65,10 @@ struct unwind_state { static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info); +static inline bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + static inline bool on_stack(unsigned long sp, unsigned long size, unsigned long low, unsigned long high, enum stack_type type, struct stack_info *info) @@ -120,4 +124,50 @@ static inline void unwind_init_common(struct unwind_state *state, state->prev_type = STACK_TYPE_UNKNOWN; } +static inline int unwind_next_common(struct unwind_state *state, + struct stack_info *info) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->fp; + + if (fp & 0x7) + return -EINVAL; + + if (!on_accessible_stack(tsk, fp, 16, info)) + return -EINVAL; + + if (test_bit(info->type, state->stacks_done)) + return -EINVAL; + + /* + * As stacks grow downward, any valid record on the same stack must be + * at a strictly higher address than the prior record. + * + * Stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL + * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * + * ... but the nesting itself is strict. Once we transition from one + * stack to another, it's never valid to unwind back to that first + * stack. + */ + if (info->type == state->prev_type) { + if (fp <= state->prev_fp) + return -EINVAL; + } else { + __set_bit(state->prev_type, state->stacks_done); + } + + /* + * Record this frame record's values and location. The prev_fp and + * prev_type are only meaningful to the next unwind_next() invocation. + */ + state->fp = READ_ONCE(*(unsigned long *)(fp)); + state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); + state->prev_fp = fp; + state->prev_type = info->type; + + return 0; +} #endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 94a5dd2ab8fd..834851939364 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -81,48 +81,15 @@ static int notrace unwind_next(struct unwind_state *state) struct task_struct *tsk = state->task; unsigned long fp = state->fp; struct stack_info info; + int err; /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0x7) - return -EINVAL; - - if (!on_accessible_stack(tsk, fp, 16, &info)) - return -EINVAL; - - if (test_bit(info.type, state->stacks_done)) - return -EINVAL; - - /* - * As stacks grow downward, any valid record on the same stack must be - * at a strictly higher address than the prior record. - * - * Stacks can nest in several valid orders, e.g. - * - * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL - * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW - * - * ... but the nesting itself is strict. 
Once we transition from one - * stack to another, it's never valid to unwind back to that first - * stack. - */ - if (info.type == state->prev_type) { - if (fp <= state->prev_fp) - return -EINVAL; - } else { - __set_bit(state->prev_type, state->stacks_done); - } - - /* - * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_next() invocation. - */ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); - state->prev_fp = fp; - state->prev_type = info.type; + err = unwind_next_common(state, &info); + if (err) + return err; state->pc = ptrauth_strip_insn_pac(state->pc); From 5b1b08619f50422c3e43d1fd7af257595a9e4a67 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:37 -0700 Subject: [PATCH 50/69] arm64: stacktrace: Handle frame pointer from different address spaces The unwinder code is made reusable so that it can be used to unwind various types of stacks. One usecase is unwinding the nVHE hyp stack from the host (EL1) in non-protected mode. This means that the unwinder must be able to translate HYP stack addresses to kernel addresses. Add a callback (stack_trace_translate_fp_fn) to allow specifying the translation function. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-5-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/common.h | 29 +++++++++++++++++++--- arch/arm64/kernel/stacktrace.c | 2 +- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 0c5cbfdb56b5..b241edba5c76 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -124,11 +124,25 @@ static inline void unwind_init_common(struct unwind_state *state, state->prev_type = STACK_TYPE_UNKNOWN; } +/* + * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to + * a kernel address. + * + * @fp: the frame pointer to be updated to its kernel address. + * @type: the stack type associated with frame pointer @fp + * + * Returns true and success and @fp is updated to the corresponding + * kernel virtual address; otherwise returns false. + */ +typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, + enum stack_type type); + static inline int unwind_next_common(struct unwind_state *state, - struct stack_info *info) + struct stack_info *info, + stack_trace_translate_fp_fn translate_fp) { + unsigned long fp = state->fp, kern_fp = fp; struct task_struct *tsk = state->task; - unsigned long fp = state->fp; if (fp & 0x7) return -EINVAL; @@ -139,6 +153,13 @@ static inline int unwind_next_common(struct unwind_state *state, if (test_bit(info->type, state->stacks_done)) return -EINVAL; + /* + * If fp is not from the current address space perform the necessary + * translation before dereferencing it to get the next fp. + */ + if (translate_fp && !translate_fp(&kern_fp, info->type)) + return -EINVAL; + /* * As stacks grow downward, any valid record on the same stack must be * at a strictly higher address than the prior record. @@ -163,8 +184,8 @@ static inline int unwind_next_common(struct unwind_state *state, * Record this frame record's values and location. The prev_fp and * prev_type are only meaningful to the next unwind_next() invocation. 
*/ - state->fp = READ_ONCE(*(unsigned long *)(fp)); - state->pc = READ_ONCE(*(unsigned long *)(fp + 8)); + state->fp = READ_ONCE(*(unsigned long *)(kern_fp)); + state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8)); state->prev_fp = fp; state->prev_type = info->type; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 834851939364..eef3cf6bf2d7 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -87,7 +87,7 @@ static int notrace unwind_next(struct unwind_state *state) if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - err = unwind_next_common(state, &info); + err = unwind_next_common(state, &info, NULL); if (err) return err; From f51e7146740514347d6c5526a2c393e224a19c0d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:38 -0700 Subject: [PATCH 51/69] arm64: stacktrace: Factor out common unwind() Move unwind() to stacktrace/common.h, and as a result the kernel unwind_next() to asm/stacktrace.h. This allow reusing unwind() in the implementation of the nVHE HYP stack unwinder, later in the series. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Reviewed-by: Mark Brown Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-6-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace.h | 51 ++++++++++++++++ arch/arm64/include/asm/stacktrace/common.h | 19 ++++++ arch/arm64/kernel/stacktrace.c | 67 ---------------------- 3 files changed, 70 insertions(+), 67 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 43f4b4a6d383..ea828579a98b 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -80,4 +81,54 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, return false; } +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. + */ +static inline int notrace unwind_next(struct unwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->fp; + struct stack_info info; + int err; + + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + err = unwind_next_common(state, &info, NULL); + if (err) + return err; + + state->pc = ptrauth_strip_insn_pac(state->pc); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk->ret_stack && + (state->pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. 
+ */ + orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, + (void *)state->fp); + if (WARN_ON_ONCE(state->pc == orig_pc)) + return -EINVAL; + state->pc = orig_pc; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->pc)) + state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); +#endif + + return 0; +} +NOKPROBE_SYMBOL(unwind_next); + #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index b241edba5c76..4b632141d91c 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -9,6 +9,7 @@ #include #include +#include #include enum stack_type { @@ -69,6 +70,8 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, unsigned long sp, unsigned long size, struct stack_info *info); +static inline int unwind_next(struct unwind_state *state); + static inline bool on_stack(unsigned long sp, unsigned long size, unsigned long low, unsigned long high, enum stack_type type, struct stack_info *info) @@ -191,4 +194,20 @@ static inline int unwind_next_common(struct unwind_state *state, return 0; } + +static inline void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} +NOKPROBE_SYMBOL(unwind); #endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index eef3cf6bf2d7..9fa60ee48499 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,14 +7,12 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include @@ -69,71 +67,6 @@ static inline void unwind_init_from_task(struct unwind_state *state, state->pc = thread_saved_pc(task); } -/* - * Unwind from one frame record (A) to the next frame record (B). - * - * We terminate early if the location of B indicates a malformed chain of frame - * records (e.g. a cycle), determined based on the location and fp value of A - * and the location (but not the fp value) of B. - */ -static int notrace unwind_next(struct unwind_state *state) -{ - struct task_struct *tsk = state->task; - unsigned long fp = state->fp; - struct stack_info info; - int err; - - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; - - err = unwind_next_common(state, &info, NULL); - if (err) - return err; - - state->pc = ptrauth_strip_insn_pac(state->pc); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (tsk->ret_stack && - (state->pc == (unsigned long)return_to_handler)) { - unsigned long orig_pc; - /* - * This is a case where function graph tracer has - * modified a return address (LR) in a stack frame - * to hook a function return. - * So replace it to an original value. 
- */ - orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, - (void *)state->fp); - if (WARN_ON_ONCE(state->pc == orig_pc)) - return -EINVAL; - state->pc = orig_pc; - } -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -#ifdef CONFIG_KRETPROBES - if (is_kretprobe_trampoline(state->pc)) - state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); -#endif - - return 0; -} -NOKPROBE_SYMBOL(unwind_next); - -static void notrace unwind(struct unwind_state *state, - stack_trace_consume_fn consume_entry, void *cookie) -{ - while (1) { - int ret; - - if (!consume_entry(cookie, state->pc)) - break; - ret = unwind_next(state); - if (ret < 0) - break; - } -} -NOKPROBE_SYMBOL(unwind); - static bool dump_backtrace_entry(void *arg, unsigned long where) { char *loglvl = arg; From 051ece6758cc10c2a6f1700ffe86d23fbb0b2553 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:39 -0700 Subject: [PATCH 52/69] arm64: stacktrace: Add description of stacktrace/common.h Add brief description on how to use stacktrace/common.h to implement a stack unwinder. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-7-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/common.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 4b632141d91c..45474b383630 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -2,6 +2,21 @@ /* * Common arm64 stack unwinder code. * + * To implement a new arm64 stack unwinder: + * 1) Include this header + * + * 2) Provide implementations for the following functions: + * on_overflow_stack(): Returns true if SP is on the overflow + * stack. + * on_accessible_stack(): Returns true is SP is on any accessible + * stack. + * unwind_next(): Performs validation checks on the frame + * pointer, and transitions unwind_state + * to the next frame. + * + * See: arch/arm64/include/asm/stacktrace.h for reference + * implementations. + * * Copyright (C) 2012 ARM Ltd. */ #ifndef __ASM_STACKTRACE_COMMON_H From 548ec3336f323db56260b312c232ab37285f0284 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:40 -0700 Subject: [PATCH 53/69] KVM: arm64: On stack overflow switch to hyp overflow_stack On hyp stack overflow switch to 16-byte aligned secondary stack. This provides us stack space to better handle overflows; and is used in a subsequent patch to dump the hypervisor stacktrace. 
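For reference, the stack pointer installed by the host.S hunk below is simply the
top of the per-CPU overflow stack; a rough C equivalent (illustration only, the
real switch has to stay in assembly since the regular stack is unusable at that
point) is:

  /*
   * Top of this CPU's overflow stack. It stays 16-byte aligned because
   * the backing array is __aligned(16) and OVERFLOW_STACK_SIZE is a
   * multiple of 16.
   */
  unsigned long new_sp = (unsigned long)this_cpu_ptr(overflow_stack) +
                         OVERFLOW_STACK_SIZE;
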
Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-8-kaleshsingh@google.com --- arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/host.S | 9 ++------- arch/arm64/kvm/hyp/nvhe/stacktrace.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 arch/arm64/kvm/hyp/nvhe/stacktrace.c diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index f9fe4dc21b1f..524e7dad5739 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index ea6a397b64a6..b6c0188c4b35 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -177,13 +177,8 @@ SYM_FUNC_END(__host_hvc) b hyp_panic .L__hyp_sp_overflow\@: - /* - * Reset SP to the top of the stack, to allow handling the hyp_panic. - * This corrupts the stack but is ok, since we won't be attempting - * any unwinding here. - */ - ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1 - mov sp, x0 + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 b hyp_panic_bad_stack ASM_BUG() diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c new file mode 100644 index 000000000000..a3d5b34e1249 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nVHE hypervisor stack tracing support. + * + * Copyright (C) 2022 Google LLC + */ +#include +#include + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); From 573e1e8275f7167ddd533c6e4e0f500f8be4d974 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:41 -0700 Subject: [PATCH 54/69] KVM: arm64: Stub implementation of non-protected nVHE HYP stack unwinder Add stub implementations of non-protected nVHE stack unwinder, for building. These are implemented later in this series. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-9-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/nvhe.h | 47 ++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 arch/arm64/include/asm/stacktrace/nvhe.h diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h new file mode 100644 index 000000000000..1192ae0f80c1 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. 
This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * Copyright (C) 2022 Google LLC + */ +#ifndef __ASM_STACKTRACE_NVHE_H +#define __ASM_STACKTRACE_NVHE_H + +#include + +static inline bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + return false; +} + +#ifndef __KVM_NVHE_HYPERVISOR__ +/* + * Conventional (non-protected) nVHE HYP stack unwinder + * + * In non-protected mode, the unwinding is done from kernel proper context + * (by the host in EL1). + */ + +static inline bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + return false; +} + +static inline int notrace unwind_next(struct unwind_state *state) +{ + return 0; +} +NOKPROBE_SYMBOL(unwind_next); + +#endif /* !__KVM_NVHE_HYPERVISOR__ */ +#endif /* __ASM_STACKTRACE_NVHE_H */ From 879e5ac7b2e4db05799a905b5a07fc9e5dedf651 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:42 -0700 Subject: [PATCH 55/69] KVM: arm64: Prepare non-protected nVHE hypervisor stacktrace In non-protected nVHE mode (non-pKVM) the host can directly access hypervisor memory; and unwinding of the hypervisor stacktrace is done from EL1 to save on memory for shared buffers. To unwind the hypervisor stack from EL1 the host needs to know the starting point for the unwind and information that will allow it to translate hypervisor stack addresses to the corresponding kernel addresses. This patch sets up this book keeping. It is made use of later in the series. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-10-kaleshsingh@google.com --- arch/arm64/include/asm/kvm_asm.h | 16 +++++++++++ arch/arm64/kvm/hyp/nvhe/stacktrace.c | 41 ++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 6 ++++ 3 files changed, 63 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2e277f2ed671..53035763e48e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -176,6 +176,22 @@ struct kvm_nvhe_init_params { unsigned long vtcr; }; +/* + * Used by the host in EL1 to dump the nVHE hypervisor backtrace on + * hyp_panic() in non-protected mode. + * + * @stack_base: hyp VA of the hyp_stack base. + * @overflow_stack_base: hyp VA of the hyp_overflow_stack base. + * @fp: hyp FP where the backtrace begins. + * @pc: hyp PC where the backtrace begins. + */ +struct kvm_nvhe_stacktrace_info { + unsigned long stack_base; + unsigned long overflow_stack_base; + unsigned long fp; + unsigned long pc; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index a3d5b34e1249..b8a280aa026a 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -4,8 +4,49 @@ * * Copyright (C) 2022 Google LLC */ +#include +#include #include #include DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); + +DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); + +/* + * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace. + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. 
+ * + * Save the information needed by the host to unwind the non-protected + * nVHE hypervisor stack in EL1. + */ +static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); + stacktrace_info->fp = fp; + stacktrace_info->pc = pc; +} + +/* + * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Saves the information needed by the host to dump the nVHE hypervisor + * backtrace. + */ +void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + if (is_protected_kvm_enabled()) + return; + else + hyp_prepare_backtrace(fp, pc); +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 6db801db8f27..64e13445d0d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -34,6 +34,8 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -375,6 +377,10 @@ asmlinkage void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } + /* Prepare to dump kvm nvhe hyp stacktrace */ + kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0), + _THIS_IP_); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } From db129d486ebdf4e3168282236f9d9008b42cac7e Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:43 -0700 Subject: [PATCH 56/69] KVM: arm64: Implement non-protected nVHE hyp stack unwinder Implements the common framework necessary for unwind() to work for non-protected nVHE mode: - on_accessible_stack() - on_overflow_stack() - unwind_next() Non-protected nVHE unwind() is used to unwind and dump the hypervisor stacktrace by the host in EL1 Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-11-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/common.h | 2 + arch/arm64/include/asm/stacktrace/nvhe.h | 76 +++++++++++++++++++++- arch/arm64/kvm/arm.c | 2 +- 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 45474b383630..3ebb69ea374a 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -34,6 +34,7 @@ enum stack_type { STACK_TYPE_OVERFLOW, STACK_TYPE_SDEI_NORMAL, STACK_TYPE_SDEI_CRITICAL, + STACK_TYPE_HYP, __NR_STACK_TYPES }; @@ -186,6 +187,7 @@ static inline int unwind_next_common(struct unwind_state *state, * * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * HYP -> OVERFLOW * * ... but the nesting itself is strict. 
Once we transition from one * stack to another, it's never valid to unwind back to that first diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 1192ae0f80c1..21082fd4a0b7 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -16,10 +16,19 @@ #include +static inline bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info); + static inline bool on_accessible_stack(const struct task_struct *tsk, unsigned long sp, unsigned long size, struct stack_info *info) { + if (on_accessible_stack_common(tsk, sp, size, info)) + return true; + + if (on_hyp_stack(sp, size, info)) + return true; + return false; } @@ -31,15 +40,78 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, * (by the host in EL1). */ +DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. + */ +static inline bool kvm_nvhe_stack_kern_va(unsigned long *addr, + enum stack_type type) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + unsigned long hyp_base, kern_base, hyp_offset; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + switch (type) { + case STACK_TYPE_HYP: + kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); + hyp_base = (unsigned long)stacktrace_info->stack_base; + break; + case STACK_TYPE_OVERFLOW: + kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; + break; + default: + return false; + } + + hyp_offset = *addr - hyp_base; + + *addr = kern_base + hyp_offset; + + return true; +} + static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { - return false; + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static inline bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); } static inline int notrace unwind_next(struct unwind_state *state) { - return 0; + struct stack_info info; + + return unwind_next_common(state, &info, kvm_nvhe_stack_kern_va); } NOKPROBE_SYMBOL(unwind_next); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0188144a122..6a64293108c5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -49,7 +49,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); 
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); From 314a61dc31845c233e47c53db3fe6f34284034f4 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:44 -0700 Subject: [PATCH 57/69] KVM: arm64: Introduce hyp_dump_backtrace() In non-protected nVHE mode, unwinds and dumps the hypervisor backtrace from EL1. This is possible beacause the host can directly access the hypervisor stack pages in non-protected mode. The nVHE backtrace is dumped on hyp_panic(), before panicking the host. [ 101.498183] kvm [377]: nVHE call trace: [ 101.498363] kvm [377]: [] __kvm_nvhe_hyp_panic+0xac/0xf8 [ 101.499045] kvm [377]: [] __kvm_nvhe_hyp_panic_bad_stack+0x10/0x10 [ 101.499498] kvm [377]: [] __kvm_nvhe_recursive_death+0x24/0x34 . . . [ 101.524929] kvm [377]: [] __kvm_nvhe_recursive_death+0x24/0x34 [ 101.525062] kvm [377]: [] __kvm_nvhe_recursive_death+0x24/0x34 [ 101.525195] kvm [377]: [] __kvm_nvhe___kvm_vcpu_run+0x30/0x40c [ 101.525333] kvm [377]: [] __kvm_nvhe_handle___kvm_vcpu_run+0x30/0x48 [ 101.525468] kvm [377]: [] __kvm_nvhe_handle_trap+0xc4/0x128 [ 101.525602] kvm [377]: [] __kvm_nvhe___host_exit+0x64/0x64 [ 101.525745] kvm [377]: ---[ end nVHE call trace ]--- Signed-off-by: Kalesh Singh Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-12-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/nvhe.h | 17 ++++++ arch/arm64/kvm/handle_exit.c | 69 ++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 21082fd4a0b7..170fe7459f7c 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -16,6 +16,23 @@ #include +/* + * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc + * + * @state : unwind_state to initialize + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. 
+ */ +static inline void kvm_nvhe_unwind_init(struct unwind_state *state, + unsigned long fp, + unsigned long pc) +{ + unwind_init_common(state, NULL); + + state->fp = fp; + state->pc = pc; +} + static inline bool on_hyp_stack(unsigned long sp, unsigned long size, struct stack_info *info); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index f66c0142b335..e83e6f735100 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -318,6 +319,71 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static inline void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static inline void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. + */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct unwind_state state; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + */ +static void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + return; + else + hyp_dump_backtrace(hyp_offset); +} + void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, @@ -353,6 +419,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, (void *)panic_addr); } + /* Dump the nVHE hypervisor backtrace */ + kvm_nvhe_dump_backtrace(hyp_offset); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're From 72adac1bd234002a65cef738e0eebfd6c2ce2e30 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:45 -0700 Subject: [PATCH 58/69] KVM: arm64: Add PROTECTED_NVHE_STACKTRACE Kconfig This can be used to disable stacktrace for the protected KVM nVHE hypervisor, in order to save on the associated memory usage. This option is disabled by default, since protected KVM is not widely used on platforms other than Android currently. 
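For completeness, the new option only shows up once its dependency is enabled,
following the Kconfig chain in the hunk below; a typical debug .config fragment
would look like:

  CONFIG_KVM=y
  CONFIG_NVHE_EL2_DEBUG=y
  CONFIG_PROTECTED_NVHE_STACKTRACE=y
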
Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-13-kaleshsingh@google.com --- arch/arm64/kvm/Kconfig | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8a5fbbf084df..09c995869916 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -46,6 +46,21 @@ menuconfig KVM If unsure, say N. +config PROTECTED_NVHE_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on NVHE_EL2_DEBUG + default n + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() + + If you are not using protected nVHE (pKVM), say N. + + If using protected nVHE mode, but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, say N. + config NVHE_EL2_DEBUG bool "Debug mode for non-VHE EL2 object" depends on KVM From 6928bcc84bc4bd9a24a1cb1986418c3de76e1d99 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:46 -0700 Subject: [PATCH 59/69] KVM: arm64: Allocate shared pKVM hyp stacktrace buffers In protected nVHE mode the host cannot directly access hypervisor memory, so we will dump the hypervisor stacktrace to a shared buffer with the host. The minimum size for the buffer required, assuming the min frame size of [x29, x30] (2 * sizeof(long)), is half the combined size of the hypervisor and overflow stacks plus an additional entry to delimit the end of the stacktrace. The stacktrace buffers are used later in the series to dump the nVHE hypervisor stacktrace when using protected-mode. Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-14-kaleshsingh@google.com --- arch/arm64/include/asm/memory.h | 8 ++++++++ arch/arm64/kvm/hyp/nvhe/stacktrace.c | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0af70d9abede..cab80a9a4086 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -113,6 +113,14 @@ #define OVERFLOW_STACK_SIZE SZ_4K +/* + * With the minimum frame size of [x29, x30], exactly half the combined + * sizes of the hyp and overflow stacks is the maximum size needed to + * save the unwinded stacktrace; plus an additional entry to delimit the + * end. + */ +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) + /* * Alignment of kernel segments (e.g. .text, .data). * diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index b8a280aa026a..e2edda92a108 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -34,6 +34,10 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) stacktrace_info->pc = pc; } +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + /* * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace * From 25aa73b6db1831527cd4f14bf0ddf8dceadec802 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:47 -0700 Subject: [PATCH 60/69] KVM: arm64: Stub implementation of pKVM HYP stack unwinder Add some stub implementations of protected nVHE stack unwinder, for building. These are implemented later in this series. 
Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-15-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/nvhe.h | 35 ++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 170fe7459f7c..2ce59c058806 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -9,6 +9,10 @@ * to allocate shared buffers for the host to read the unwinded * stacktrace. * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * * Copyright (C) 2022 Google LLC */ #ifndef __ASM_STACKTRACE_NVHE_H @@ -49,7 +53,34 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, return false; } -#ifndef __KVM_NVHE_HYPERVISOR__ +#ifdef __KVM_NVHE_HYPERVISOR__ +/* + * Protected nVHE HYP stack unwinder + * + * In protected mode, the unwinding is done by the hypervisor in EL2. + */ + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +static inline bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + return false; +} + +static inline bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + return false; +} + +static inline int notrace unwind_next(struct unwind_state *state) +{ + return 0; +} +NOKPROBE_SYMBOL(unwind_next); +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +#else /* !__KVM_NVHE_HYPERVISOR__ */ /* * Conventional (non-protected) nVHE HYP stack unwinder * @@ -132,5 +163,5 @@ static inline int notrace unwind_next(struct unwind_state *state) } NOKPROBE_SYMBOL(unwind_next); -#endif /* !__KVM_NVHE_HYPERVISOR__ */ +#endif /* __KVM_NVHE_HYPERVISOR__ */ #endif /* __ASM_STACKTRACE_NVHE_H */ From 871c5d931417d3c0e1aa32c9e04da1dc74703843 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:48 -0700 Subject: [PATCH 61/69] KVM: arm64: Save protected-nVHE (pKVM) hyp stacktrace In protected nVHE mode, the host cannot access private owned hypervisor memory. Also the hypervisor aims to remains simple to reduce the attack surface and does not provide any printk support. For the above reasons, the approach taken to provide hypervisor stacktraces in protected mode is: 1) Unwind and save the hyp stack addresses in EL2 to a shared buffer with the host (done in this patch). 2) Delegate the dumping and symbolization of the addresses to the host in EL1 (later patch in the series). On hyp_panic(), the hypervisor prepares the stacktrace before returning to the host. 
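To put a number on the shared buffer sized earlier in the series: with 4K pages
the hyp stack (PAGE_SIZE) and the overflow stack (OVERFLOW_STACK_SIZE) total
8192 bytes, and a minimum frame of [x29, x30] (16 bytes) bounds the trace at 512
frames, each saved as one 8-byte entry:

  NVHE_STACKTRACE_SIZE = (4096 + 4096) / 2 + sizeof(long)
                       = 4096 + 8
                       = 4104 bytes, i.e. 512 entries plus the 0 delimiter
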
Signed-off-by: Kalesh Singh Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-16-kaleshsingh@google.com --- arch/arm64/kvm/hyp/nvhe/stacktrace.c | 55 +++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index e2edda92a108..900324b7a08f 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -35,7 +35,60 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) } #ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#include + DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); + +/* + * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry + * + * @arg : index of the entry in the stacktrace buffer + * @where : the program counter corresponding to the stack frame + * + * Save the return address of a stack frame to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); + int size = NVHE_STACKTRACE_SIZE / sizeof(long); + int *idx = (int *)arg; + + /* + * Need 2 free slots: 1 for current entry and 1 for the + * delimiter. + */ + if (*idx > size - 2) + return false; + + stacktrace[*idx] = where; + stacktrace[++*idx] = 0UL; + + return true; +} + +/* + * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the unwinded stack addresses to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ + struct unwind_state state; + int idx = 0; + + kvm_nvhe_unwind_init(&state, fp, pc); + + unwind(&state, pkvm_save_backtrace_entry, &idx); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ +} #endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ /* @@ -50,7 +103,7 @@ DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrac void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) { if (is_protected_kvm_enabled()) - return; + pkvm_save_backtrace(fp, pc); else hyp_prepare_backtrace(fp, pc); } From 75e9459e48d4867caf549e388bd4faabe1dbcbd3 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:49 -0700 Subject: [PATCH 62/69] KVM: arm64: Implement protected nVHE hyp stack unwinder Implements the common framework necessary for unwind() to work in the protected nVHE context: - on_accessible_stack() - on_overflow_stack() - unwind_next() Protected nVHE unwind() is used to unwind and save the hyp stack addresses to the shared stacktrace buffer. The host reads the entries in this buffer, symbolizes and dumps the stacktrace (later patch in the series). 
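As a hypothetical illustration of how the common framework is driven (not code
from this series): any unwinder built on stacktrace/common.h pairs unwind() with
a stack_trace_consume_fn callback, for example one that merely counts frames:

  /* Hypothetical helpers; only the init/unwind calls come from the series. */
  static bool count_entry(void *cookie, unsigned long where)
  {
          (*(unsigned int *)cookie)++;
          return true;            /* keep unwinding */
  }

  static unsigned int count_hyp_frames(unsigned long fp, unsigned long pc)
  {
          struct unwind_state state;
          unsigned int n = 0;

          kvm_nvhe_unwind_init(&state, fp, pc);
          unwind(&state, count_entry, &n);

          return n;
  }
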
Signed-off-by: Kalesh Singh Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-17-kaleshsingh@google.com --- arch/arm64/include/asm/stacktrace/nvhe.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 2ce59c058806..600dbc2220b6 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -64,18 +64,27 @@ static inline bool on_accessible_stack(const struct task_struct *tsk, static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { - return false; + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); } static inline bool on_hyp_stack(unsigned long sp, unsigned long size, struct stack_info *info) { - return false; + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); } static inline int notrace unwind_next(struct unwind_state *state) { - return 0; + struct stack_info info; + + return unwind_next_common(state, &info, NULL); } NOKPROBE_SYMBOL(unwind_next); #endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ From 3a7e1b55aad45c0cf86bd4e2f212bb9a61905142 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 26 Jul 2022 00:37:50 -0700 Subject: [PATCH 63/69] KVM: arm64: Introduce pkvm_dump_backtrace() Dumps the pKVM hypervisor backtrace from EL1 by reading the unwinded addresses from the shared stacktrace buffer. The nVHE hyp backtrace is dumped on hyp_panic(), before panicking the host. [ 111.623091] kvm [367]: nVHE call trace: [ 111.623215] kvm [367]: [] __kvm_nvhe_hyp_panic+0xac/0xf8 [ 111.623448] kvm [367]: [] __kvm_nvhe_hyp_panic_bad_stack+0x10/0x10 [ 111.623642] kvm [367]: [] __kvm_nvhe_recursive_death+0x24/0x34 . . . [ 111.640366] kvm [367]: [] __kvm_nvhe_recursive_death+0x24/0x34 [ 111.640467] kvm [367]: [] __kvm_nvhe_recursive_death+0x24/0x34 [ 111.640574] kvm [367]: [] __kvm_nvhe___kvm_vcpu_run+0x30/0x40c [ 111.640676] kvm [367]: [] __kvm_nvhe_handle___kvm_vcpu_run+0x30/0x48 [ 111.640778] kvm [367]: [] __kvm_nvhe_handle_trap+0xc4/0x128 [ 111.640880] kvm [367]: [] __kvm_nvhe___host_exit+0x64/0x64 [ 111.640996] kvm [367]: ---[ end nVHE call trace ]--- Signed-off-by: Kalesh Singh Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20220726073750.3219117-18-kaleshsingh@google.com --- arch/arm64/kvm/handle_exit.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index e83e6f735100..c14fc4ba4422 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -371,6 +371,39 @@ static void hyp_dump_backtrace(unsigned long hyp_offset) kvm_nvhe_dump_backtrace_end(); } +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. 
+ * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. + */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i, size = NVHE_STACKTRACE_SIZE / sizeof(long); + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; i < size && stacktrace[i]; i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + /* * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. * @@ -379,7 +412,7 @@ static void hyp_dump_backtrace(unsigned long hyp_offset) static void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) { if (is_protected_kvm_enabled()) - return; + pkvm_dump_backtrace(hyp_offset); else hyp_dump_backtrace(hyp_offset); } From 03fe9cd05b9f38353208c23bd791dac47c912054 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Jul 2022 15:29:01 +0100 Subject: [PATCH 64/69] KVM: arm64: Move PROTECTED_NVHE_STACKTRACE around Make the dependency with EL2_DEBUG more obvious by moving the stacktrace configurtion *after* it. Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220727142906.1856759-2-maz@kernel.org --- arch/arm64/kvm/Kconfig | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 09c995869916..815cc118c675 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -46,21 +46,6 @@ menuconfig KVM If unsure, say N. -config PROTECTED_NVHE_STACKTRACE - bool "Protected KVM hypervisor stacktraces" - depends on NVHE_EL2_DEBUG - default n - help - Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() - - If you are not using protected nVHE (pKVM), say N. - - If using protected nVHE mode, but cannot afford the associated - memory cost (less than 0.75 page per CPU) of pKVM stacktraces, - say N. - - If unsure, say N. - config NVHE_EL2_DEBUG bool "Debug mode for non-VHE EL2 object" depends on KVM @@ -71,4 +56,17 @@ config NVHE_EL2_DEBUG If unsure, say N. +config PROTECTED_NVHE_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on NVHE_EL2_DEBUG + default n + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() + + If using protected nVHE mode, but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, or not using protected nVHE (pKVM), say N. + endif # VIRTUALIZATION From 9f5fee05f6897d0fe0e3a44ade71bb85cd97b2ef Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Jul 2022 15:29:02 +0100 Subject: [PATCH 65/69] KVM: arm64: Move nVHE stacktrace unwinding into its own compilation unit The unwinding code doesn't really belong to the exit handling code. Instead, move it to a file (conveniently named stacktrace.c to confuse the reviewer), and move all the stacktrace-related stuff there. It will be joined by more code very soon. 
Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220727142906.1856759-3-maz@kernel.org --- arch/arm64/include/asm/stacktrace/nvhe.h | 2 + arch/arm64/kvm/Makefile | 2 +- arch/arm64/kvm/handle_exit.c | 98 ------------------ arch/arm64/kvm/stacktrace.c | 120 +++++++++++++++++++++++ 4 files changed, 123 insertions(+), 99 deletions(-) create mode 100644 arch/arm64/kvm/stacktrace.c diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 600dbc2220b6..8a5cb96d7143 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -172,5 +172,7 @@ static inline int notrace unwind_next(struct unwind_state *state) } NOKPROBE_SYMBOL(unwind_next); +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); + #endif /* __KVM_NVHE_HYPERVISOR__ */ #endif /* __ASM_STACKTRACE_NVHE_H */ diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index aa127ae9f675..5e33c2d4645a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ - guest.o debug.o reset.o sys_regs.o \ + guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o \ vgic/vgic.o vgic/vgic-init.o \ diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index c14fc4ba4422..ef8b57953aa2 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -319,104 +319,6 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } -/* - * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry - * - * @arg : the hypervisor offset, used for address translation - * @where : the program counter corresponding to the stack frame - */ -static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) -{ - unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); - unsigned long hyp_offset = (unsigned long)arg; - - /* Mask tags and convert to kern addr */ - where = (where & va_mask) + hyp_offset; - kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); - - return true; -} - -static inline void kvm_nvhe_dump_backtrace_start(void) -{ - kvm_err("nVHE call trace:\n"); -} - -static inline void kvm_nvhe_dump_backtrace_end(void) -{ - kvm_err("---[ end nVHE call trace ]---\n"); -} - -/* - * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. - * - * @hyp_offset: hypervisor offset, used for address translation. - * - * The host can directly access HYP stack pages in non-protected - * mode, so the unwinding is done directly from EL1. This removes - * the need for shared buffers between host and hypervisor for - * the stacktrace. 
- */ -static void hyp_dump_backtrace(unsigned long hyp_offset) -{ - struct kvm_nvhe_stacktrace_info *stacktrace_info; - struct unwind_state state; - - stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); - - kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); - - kvm_nvhe_dump_backtrace_start(); - unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); - kvm_nvhe_dump_backtrace_end(); -} - -#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE -DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], - pkvm_stacktrace); - -/* - * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. - * - * @hyp_offset: hypervisor offset, used for address translation. - * - * Dumping of the pKVM HYP backtrace is done by reading the - * stack addresses from the shared stacktrace buffer, since the - * host cannot directly access hypervisor memory in protected - * mode. - */ -static void pkvm_dump_backtrace(unsigned long hyp_offset) -{ - unsigned long *stacktrace - = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); - int i, size = NVHE_STACKTRACE_SIZE / sizeof(long); - - kvm_nvhe_dump_backtrace_start(); - /* The saved stacktrace is terminated by a null entry */ - for (i = 0; i < size && stacktrace[i]; i++) - kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); - kvm_nvhe_dump_backtrace_end(); -} -#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ -static void pkvm_dump_backtrace(unsigned long hyp_offset) -{ - kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); -} -#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ - -/* - * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. - * - * @hyp_offset: hypervisor offset, used for address translation. - */ -static void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) -{ - if (is_protected_kvm_enabled()) - pkvm_dump_backtrace(hyp_offset); - else - hyp_dump_backtrace(hyp_offset); -} - void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c new file mode 100644 index 000000000000..9812aefdcfb4 --- /dev/null +++ b/arch/arm64/kvm/stacktrace.c @@ -0,0 +1,120 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. 
+ * + * Copyright (C) 2022 Google LLC + */ + +#include +#include + +#include + +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. + */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct unwind_state state; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. + */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i, size = NVHE_STACKTRACE_SIZE / sizeof(long); + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; i < size && stacktrace[i]; i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. 
+ */ +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + pkvm_dump_backtrace(hyp_offset); + else + hyp_dump_backtrace(hyp_offset); +} From 4e00532f37365967e9896966b1fe61888e659259 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Jul 2022 15:29:03 +0100 Subject: [PATCH 66/69] KVM: arm64: Make unwind()/on_accessible_stack() per-unwinder functions Having multiple versions of on_accessible_stack() (one per unwinder) makes it very hard to reason about what is used where due to the complexity of the various includes, the forward declarations, and the reliance on everything being 'inline'. Instead, move the code back where it should be. Each unwinder implements: - on_accessible_stack() as well as the helpers it depends on, - unwind()/unwind_next(), as they pass on_accessible_stack as a parameter to unwind_next_common() (which is the only common code here) This hardly results in any duplication, and makes it much easier to reason about the code. Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220727142906.1856759-4-maz@kernel.org --- arch/arm64/include/asm/stacktrace.h | 74 ------------------ arch/arm64/include/asm/stacktrace/common.h | 55 ++++--------- arch/arm64/include/asm/stacktrace/nvhe.h | 84 +------------------- arch/arm64/kernel/stacktrace.c | 90 ++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/stacktrace.c | 52 +++++++++++++ arch/arm64/kvm/stacktrace.c | 55 +++++++++++++ 6 files changed, 213 insertions(+), 197 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index ea828579a98b..6ebdcdff77f5 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -57,78 +57,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif - -/* - * We can only safely access per-cpu stacks from current in a non-preemptible - * context. - */ -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (on_accessible_stack_common(tsk, sp, size, info)) - return true; - - if (on_task_stack(tsk, sp, size, info)) - return true; - if (tsk != current || preemptible()) - return false; - if (on_irq_stack(sp, size, info)) - return true; - if (on_sdei_stack(sp, size, info)) - return true; - - return false; -} - -/* - * Unwind from one frame record (A) to the next frame record (B). - * - * We terminate early if the location of B indicates a malformed chain of frame - * records (e.g. a cycle), determined based on the location and fp value of A - * and the location (but not the fp value) of B. - */ -static inline int notrace unwind_next(struct unwind_state *state) -{ - struct task_struct *tsk = state->task; - unsigned long fp = state->fp; - struct stack_info info; - int err; - - /* Final frame; nothing to unwind */ - if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) - return -ENOENT; - - err = unwind_next_common(state, &info, NULL); - if (err) - return err; - - state->pc = ptrauth_strip_insn_pac(state->pc); - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - if (tsk->ret_stack && - (state->pc == (unsigned long)return_to_handler)) { - unsigned long orig_pc; - /* - * This is a case where function graph tracer has - * modified a return address (LR) in a stack frame - * to hook a function return. - * So replace it to an original value. 
- */ - orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, - (void *)state->fp); - if (WARN_ON_ONCE(state->pc == orig_pc)) - return -EINVAL; - state->pc = orig_pc; - } -#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -#ifdef CONFIG_KRETPROBES - if (is_kretprobe_trampoline(state->pc)) - state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); -#endif - - return 0; -} -NOKPROBE_SYMBOL(unwind_next); - #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 3ebb69ea374a..18046a7248a2 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -79,15 +79,6 @@ struct unwind_state { struct task_struct *task; }; -static inline bool on_overflow_stack(unsigned long sp, unsigned long size, - struct stack_info *info); - -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info); - -static inline int unwind_next(struct unwind_state *state); - static inline bool on_stack(unsigned long sp, unsigned long size, unsigned long low, unsigned long high, enum stack_type type, struct stack_info *info) @@ -106,21 +97,6 @@ static inline bool on_stack(unsigned long sp, unsigned long size, return true; } -static inline bool on_accessible_stack_common(const struct task_struct *tsk, - unsigned long sp, - unsigned long size, - struct stack_info *info) -{ - if (info) - info->type = STACK_TYPE_UNKNOWN; - - /* - * Both the kernel and nvhe hypervisor make use of - * an overflow_stack - */ - return on_overflow_stack(sp, size, info); -} - static inline void unwind_init_common(struct unwind_state *state, struct task_struct *task) { @@ -156,8 +132,22 @@ static inline void unwind_init_common(struct unwind_state *state, typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, enum stack_type type); +/* + * on_accessible_stack_fn() - Check whether a stack range is on any + * of the possible stacks. 
+ * + * @tsk: task whose stack is being unwound + * @sp: stack address being checked + * @size: size of the stack range being checked + * @info: stack unwinding context + */ +typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + static inline int unwind_next_common(struct unwind_state *state, struct stack_info *info, + on_accessible_stack_fn accessible, stack_trace_translate_fp_fn translate_fp) { unsigned long fp = state->fp, kern_fp = fp; @@ -166,7 +156,7 @@ static inline int unwind_next_common(struct unwind_state *state, if (fp & 0x7) return -EINVAL; - if (!on_accessible_stack(tsk, fp, 16, info)) + if (!accessible(tsk, fp, 16, info)) return -EINVAL; if (test_bit(info->type, state->stacks_done)) @@ -212,19 +202,4 @@ static inline int unwind_next_common(struct unwind_state *state, return 0; } -static inline void notrace unwind(struct unwind_state *state, - stack_trace_consume_fn consume_entry, - void *cookie) -{ - while (1) { - int ret; - - if (!consume_entry(cookie, state->pc)) - break; - ret = unwind_next(state); - if (ret < 0) - break; - } -} -NOKPROBE_SYMBOL(unwind); #endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 8a5cb96d7143..a096216d8970 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -37,59 +37,7 @@ static inline void kvm_nvhe_unwind_init(struct unwind_state *state, state->pc = pc; } -static inline bool on_hyp_stack(unsigned long sp, unsigned long size, - struct stack_info *info); - -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (on_accessible_stack_common(tsk, sp, size, info)) - return true; - - if (on_hyp_stack(sp, size, info)) - return true; - - return false; -} - -#ifdef __KVM_NVHE_HYPERVISOR__ -/* - * Protected nVHE HYP stack unwinder - * - * In protected mode, the unwinding is done by the hypervisor in EL2. 
- */ - -#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE -static inline bool on_overflow_stack(unsigned long sp, unsigned long size, - struct stack_info *info) -{ - unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); - unsigned long high = low + OVERFLOW_STACK_SIZE; - - return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); -} - -static inline bool on_hyp_stack(unsigned long sp, unsigned long size, - struct stack_info *info) -{ - struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; - - return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); -} - -static inline int notrace unwind_next(struct unwind_state *state) -{ - struct stack_info info; - - return unwind_next_common(state, &info, NULL); -} -NOKPROBE_SYMBOL(unwind_next); -#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ - -#else /* !__KVM_NVHE_HYPERVISOR__ */ +#ifndef __KVM_NVHE_HYPERVISOR__ /* * Conventional (non-protected) nVHE HYP stack unwinder * @@ -142,36 +90,6 @@ static inline bool kvm_nvhe_stack_kern_va(unsigned long *addr, return true; } -static inline bool on_overflow_stack(unsigned long sp, unsigned long size, - struct stack_info *info) -{ - struct kvm_nvhe_stacktrace_info *stacktrace_info - = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); - unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; - unsigned long high = low + OVERFLOW_STACK_SIZE; - - return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); -} - -static inline bool on_hyp_stack(unsigned long sp, unsigned long size, - struct stack_info *info) -{ - struct kvm_nvhe_stacktrace_info *stacktrace_info - = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); - unsigned long low = (unsigned long)stacktrace_info->stack_base; - unsigned long high = low + PAGE_SIZE; - - return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); -} - -static inline int notrace unwind_next(struct unwind_state *state) -{ - struct stack_info info; - - return unwind_next_common(state, &info, kvm_nvhe_stack_kern_va); -} -NOKPROBE_SYMBOL(unwind_next); - void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); #endif /* __KVM_NVHE_HYPERVISOR__ */ diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 9fa60ee48499..ce190ee18a20 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -67,6 +67,96 @@ static inline void unwind_init_from_task(struct unwind_state *state, state->pc = thread_saved_pc(task); } +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + if (on_task_stack(tsk, sp, size, info)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp, size, info)) + return true; + if (on_overflow_stack(sp, size, info)) + return true; + if (on_sdei_stack(sp, size, info)) + return true; + + return false; +} + +/* + * Unwind from one frame record (A) to the next frame record (B). + * + * We terminate early if the location of B indicates a malformed chain of frame + * records (e.g. a cycle), determined based on the location and fp value of A + * and the location (but not the fp value) of B. 
+ */ +static int notrace unwind_next(struct unwind_state *state) +{ + struct task_struct *tsk = state->task; + unsigned long fp = state->fp; + struct stack_info info; + int err; + + /* Final frame; nothing to unwind */ + if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) + return -ENOENT; + + err = unwind_next_common(state, &info, on_accessible_stack, NULL); + if (err) + return err; + + state->pc = ptrauth_strip_insn_pac(state->pc); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk->ret_stack && + (state->pc == (unsigned long)return_to_handler)) { + unsigned long orig_pc; + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + orig_pc = ftrace_graph_ret_addr(tsk, NULL, state->pc, + (void *)state->fp); + if (WARN_ON_ONCE(state->pc == orig_pc)) + return -EINVAL; + state->pc = orig_pc; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +#ifdef CONFIG_KRETPROBES + if (is_kretprobe_trampoline(state->pc)) + state->pc = kretprobe_find_ret_addr(tsk, (void *)state->fp, &state->kr_cur); +#endif + + return 0; +} +NOKPROBE_SYMBOL(unwind_next); + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} +NOKPROBE_SYMBOL(unwind); + static bool dump_backtrace_entry(void *arg, unsigned long where) { char *loglvl = arg; diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index 900324b7a08f..acbe272ecb32 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -39,6 +39,58 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, NULL); +} + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + /* * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry * diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 9812aefdcfb4..4d5fec3175ff 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -21,6 +21,61 @@ #include +static bool on_overflow_stack(unsigned long 
sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, + kvm_nvhe_stack_kern_va); +} + +static void unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + /* * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry * From 0e773da1e688a1425ef7deae58fa11c5c7e09533 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Jul 2022 15:29:04 +0100 Subject: [PATCH 67/69] KVM: arm64: Move nVHE-only helpers into kvm/stacktrace.c kvm_nvhe_stack_kern_va() only makes sense as part of the nVHE unwinder, so simply move it there. Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220727142906.1856759-5-maz@kernel.org --- arch/arm64/include/asm/stacktrace/nvhe.h | 41 ------------------------ arch/arm64/kvm/stacktrace.c | 41 ++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index a096216d8970..d5527b600390 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -49,47 +49,6 @@ DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overf DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -/* - * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs - * - * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to - * allow for guard pages below the stack. Consequently, the fixed offset address - * translation macros won't work here. - * - * The kernel VA is calculated as an offset from the kernel VA of the hypervisor - * stack base. - * - * Returns true on success and updates @addr to its corresponding kernel VA; - * otherwise returns false. 
- */ -static inline bool kvm_nvhe_stack_kern_va(unsigned long *addr, - enum stack_type type) -{ - struct kvm_nvhe_stacktrace_info *stacktrace_info; - unsigned long hyp_base, kern_base, hyp_offset; - - stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); - - switch (type) { - case STACK_TYPE_HYP: - kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); - hyp_base = (unsigned long)stacktrace_info->stack_base; - break; - case STACK_TYPE_OVERFLOW: - kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); - hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; - break; - default: - return false; - } - - hyp_offset = *addr - hyp_base; - - *addr = kern_base + hyp_offset; - - return true; -} - void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); #endif /* __KVM_NVHE_HYPERVISOR__ */ diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 4d5fec3175ff..417665854f86 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -21,6 +21,47 @@ #include +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. + */ +static bool kvm_nvhe_stack_kern_va(unsigned long *addr, + enum stack_type type) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + unsigned long hyp_base, kern_base, hyp_offset; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + switch (type) { + case STACK_TYPE_HYP: + kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); + hyp_base = (unsigned long)stacktrace_info->stack_base; + break; + case STACK_TYPE_OVERFLOW: + kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; + break; + default: + return false; + } + + hyp_offset = *addr - hyp_base; + + *addr = kern_base + hyp_offset; + + return true; +} + static bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { From 62ae21627aa96f6ef361981dd181c74dc7aa314c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 27 Jul 2022 15:29:05 +0100 Subject: [PATCH 68/69] KVM: arm64: Don't open code ARRAY_SIZE() Use ARRAY_SIZE() instead of an open-coded version. Signed-off-by: Oliver Upton Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Link: https://lore.kernel.org/r/20220727142906.1856759-6-maz@kernel.org --- arch/arm64/kvm/hyp/nvhe/stacktrace.c | 3 +-- arch/arm64/kvm/stacktrace.c | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index acbe272ecb32..58f645ad66bc 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -103,14 +103,13 @@ static void notrace unwind(struct unwind_state *state, static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) { unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); - int size = NVHE_STACKTRACE_SIZE / sizeof(long); int *idx = (int *)arg; /* * Need 2 free slots: 1 for current entry and 1 for the * delimiter. 
*/ - if (*idx > size - 2) + if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2) return false; stacktrace[*idx] = where; diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 417665854f86..949d19d603fb 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -187,11 +187,13 @@ static void pkvm_dump_backtrace(unsigned long hyp_offset) { unsigned long *stacktrace = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); - int i, size = NVHE_STACKTRACE_SIZE / sizeof(long); + int i; kvm_nvhe_dump_backtrace_start(); /* The saved stacktrace is terminated by a null entry */ - for (i = 0; i < size && stacktrace[i]; i++) + for (i = 0; + i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i]; + i++) kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); kvm_nvhe_dump_backtrace_end(); } From a4c750e2328a117dc9b19a2a61db0d4347902029 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 27 Jul 2022 15:29:06 +0100 Subject: [PATCH 69/69] arm64: Update 'unwinder howto' Implementing a new unwinder is a bit more involved than writing a couple of helpers, so let's not lure the reader into a false sense of comfort. Instead, let's point out what they should call into, and what sort of parameter they need to provide. Signed-off-by: Marc Zyngier Reviewed-by: Kalesh Singh Tested-by: Kalesh Singh Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20220727142906.1856759-7-maz@kernel.org --- arch/arm64/include/asm/stacktrace/common.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h index 18046a7248a2..f58eb944c46f 100644 --- a/arch/arm64/include/asm/stacktrace/common.h +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -5,17 +5,11 @@ * To implement a new arm64 stack unwinder: * 1) Include this header * - * 2) Provide implementations for the following functions: - * on_overflow_stack(): Returns true if SP is on the overflow - * stack. - * on_accessible_stack(): Returns true is SP is on any accessible - * stack. - * unwind_next(): Performs validation checks on the frame - * pointer, and transitions unwind_state - * to the next frame. + * 2) Call into unwind_next_common() from your top level unwind + * function, passing it the validation and translation callbacks + * (though the later can be NULL if no translation is required). * - * See: arch/arm64/include/asm/stacktrace.h for reference - * implementations. + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. * * Copyright (C) 2012 ARM Ltd. */
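
To make the updated howto concrete, here is a minimal sketch of what a new
unwinder provides under this scheme. Everything prefixed my_, together with
the stack bounds and the STACK_TYPE_HYP choice, is hypothetical and only
mirrors the reference implementations in arch/arm64/kernel/stacktrace.c and
the KVM unwinders earlier in this series:

  /* 1) Include this header */
  #include <asm/stacktrace/common.h>

  /* Hypothetical bounds of the stack this unwinder walks */
  static unsigned long my_stack_low, my_stack_high;

  static bool my_on_accessible_stack(const struct task_struct *tsk,
                                     unsigned long sp, unsigned long size,
                                     struct stack_info *info)
  {
          if (info)
                  info->type = STACK_TYPE_UNKNOWN;

          return on_stack(sp, size, my_stack_low, my_stack_high,
                          STACK_TYPE_HYP, info);
  }

  static int my_unwind_next(struct unwind_state *state)
  {
          struct stack_info info;

          /*
           * 2) Hand the validation (and optional translation) callbacks
           * to unwind_next_common(); NULL means no translation is needed.
           */
          return unwind_next_common(state, &info, my_on_accessible_stack, NULL);
  }

  static void my_unwind(struct unwind_state *state,
                        stack_trace_consume_fn consume_entry, void *cookie)
  {
          while (1) {
                  if (!consume_entry(cookie, state->pc))
                          break;
                  if (my_unwind_next(state) < 0)
                          break;
          }
  }

Unwinders that cannot dereference frame records at their native addresses
(such as the host-side pKVM unwinder) pass a translation callback, e.g.
kvm_nvhe_stack_kern_va, instead of NULL.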