linux-stable/arch/riscv/include/asm/cpufeature.h

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2022-2023 Rivos, Inc
 */

#ifndef _ASM_CPUFEATURE_H
#define _ASM_CPUFEATURE_H

#include <linux/bitmap.h>
#include <linux/jump_label.h>
#include <asm/hwcap.h>
#include <asm/alternative-macros.h>
#include <asm/errno.h>

/*
 * These are probed via a device_initcall(), via either the SBI or directly
 * from the corresponding CSRs.
 */
struct riscv_cpuinfo {
	unsigned long mvendorid;
	unsigned long marchid;
	unsigned long mimpid;
};

struct riscv_isainfo {
	DECLARE_BITMAP(isa, RISCV_ISA_EXT_MAX);
};

DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
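/*
 * Illustrative sketch (not part of the original header): reading the probed
 * ID registers for a given CPU from the per-cpu data above; the helper name
 * show_hart_ids() is hypothetical.
 *
 *	static void show_hart_ids(int cpu)
 *	{
 *		struct riscv_cpuinfo *ci = &per_cpu(riscv_cpuinfo, cpu);
 *
 *		pr_info("cpu%d: mvendorid=%lx marchid=%lx mimpid=%lx\n",
 *			cpu, ci->mvendorid, ci->marchid, ci->mimpid);
 *	}
 */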
DECLARE_PER_CPU(long, misaligned_access_speed);
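/*
 * Illustrative sketch: per_cpu() can be used to inspect the measured speed
 * for a hart. Assumes the RISCV_HWPROBE_MISALIGNED_* values from
 * <asm/hwprobe.h>; the helper name is hypothetical.
 *
 *	static bool has_fast_misaligned_access(int cpu)
 *	{
 *		return per_cpu(misaligned_access_speed, cpu) ==
 *		       RISCV_HWPROBE_MISALIGNED_FAST;
 *	}
 */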
/* Per-cpu ISA extensions. */
extern struct riscv_isainfo hart_isa[NR_CPUS];
void riscv_user_isa_enable(void);
#ifdef CONFIG_RISCV_MISALIGNED
bool unaligned_ctl_available(void);
bool check_unaligned_access_emulated(int cpu);
void unaligned_emulation_finish(void);
#else
static inline bool unaligned_ctl_available(void)
{
	return false;
}

static inline bool check_unaligned_access_emulated(int cpu)
{
	return false;
}

static inline void unaligned_emulation_finish(void) {}
#endif
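/*
 * Illustrative sketch: because the !CONFIG_RISCV_MISALIGNED stubs above
 * simply return false, callers need no #ifdef of their own; hypothetical
 * example caller:
 *
 *	static int example_set_unalign_ctl(unsigned long val)
 *	{
 *		if (!unaligned_ctl_available())
 *			return -EINVAL;
 *		return 0;
 *	}
 */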
unsigned long riscv_get_elf_hwcap(void);
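/*
 * Illustrative sketch: riscv_get_elf_hwcap() supplies the ELF_HWCAP auxvec
 * value, a bitmask of single-letter extensions; COMPAT_HWCAP_ISA_C comes
 * from <asm/hwcap.h>. Hypothetical usage:
 *
 *	if (riscv_get_elf_hwcap() & COMPAT_HWCAP_ISA_C)
 *		pr_info("compressed instructions in ELF_HWCAP\n");
 */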
struct riscv_isa_ext_data {
	const unsigned int id;
	const char *name;
	const char *property;
	const unsigned int *subset_ext_ids;
	const unsigned int subset_ext_size;
};
extern const struct riscv_isa_ext_data riscv_isa_ext[];
extern const size_t riscv_isa_ext_count;
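/*
 * Illustrative sketch: walking the extension table exported above, e.g. to
 * log every extension name the kernel knows; the helper name is
 * hypothetical.
 *
 *	static void list_known_extensions(void)
 *	{
 *		size_t i;
 *
 *		for (i = 0; i < riscv_isa_ext_count; i++)
 *			pr_info("isa-ext: %s\n", riscv_isa_ext[i].name);
 *	}
 */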
extern bool riscv_isa_fallback;
unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap);
bool __riscv_isa_extension_available(const unsigned long *isa_bitmap, unsigned int bit);
#define riscv_isa_extension_available(isa_bitmap, ext)	\
	__riscv_isa_extension_available(isa_bitmap, RISCV_ISA_EXT_##ext)
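/*
 * Illustrative sketch: the macro pastes the short name onto the
 * RISCV_ISA_EXT_ prefix, and a NULL bitmap checks the ISA common to all
 * harts, e.g.:
 *
 *	if (riscv_isa_extension_available(NULL, ZBB))
 *		pr_info("Zbb available on all harts\n");
 */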
static __always_inline bool
riscv_has_extension_likely(const unsigned long ext)
{
	compiletime_assert(ext < RISCV_ISA_EXT_MAX,
			   "ext must be < RISCV_ISA_EXT_MAX");

	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
		asm goto(
		ALTERNATIVE("j %l[l_no]", "nop", 0, %[ext], 1)
		:
		: [ext] "i" (ext)
		:
		: l_no);
	} else {
		if (!__riscv_isa_extension_available(NULL, ext))
			goto l_no;
	}

	return true;
l_no:
	return false;
}
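/*
 * Illustrative sketch: the _likely/_unlikely variants are meant for hot
 * paths, where the alternatives mechanism patches the branch at boot
 * instead of testing a bitmap at runtime. Hypothetical caller (both page
 * helpers are invented here):
 *
 *	if (riscv_has_extension_likely(RISCV_ISA_EXT_ZICBOZ))
 *		clear_page_zicboz(page);
 *	else
 *		clear_page_generic(page);
 */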
static __always_inline bool
riscv_has_extension_unlikely(const unsigned long ext)
{
	compiletime_assert(ext < RISCV_ISA_EXT_MAX,
			   "ext must be < RISCV_ISA_EXT_MAX");

	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE)) {
		asm goto(
		ALTERNATIVE("nop", "j %l[l_yes]", 0, %[ext], 1)
		:
		: [ext] "i" (ext)
		:
		: l_yes);
	} else {
		if (__riscv_isa_extension_available(NULL, ext))
			goto l_yes;
	}

	return false;
l_yes:
	return true;
}
static __always_inline bool riscv_cpu_has_extension_likely(int cpu, const unsigned long ext)
{
	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_likely(ext))
		return true;

	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
}
static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsigned long ext)
{
	if (IS_ENABLED(CONFIG_RISCV_ALTERNATIVE) && riscv_has_extension_unlikely(ext))
		return true;

	return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
}
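/*
 * Illustrative sketch: the per-cpu variants fall back to that hart's own
 * ISA bitmap, so they can return true on a hart whose extension is not
 * common to all harts; the helper name is hypothetical.
 *
 *	static bool cpu_supports_vector(int cpu)
 *	{
 *		return riscv_cpu_has_extension_unlikely(cpu, RISCV_ISA_EXT_v);
 *	}
 */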
DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);

#endif