From 471aba2e4760f9abcec7d872a85795c6dd60dce1 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Wed, 28 Jun 2023 05:12:13 -0400 Subject: [PATCH 01/51] riscv: sigcontext: Correct the comment of sigreturn The real-time signals enlarged the sigset_t type, and most architectures have changed to using rt_sigreturn as the only way. The riscv is one of them, and there is no sys_sigreturn in it. Only some old architectures preserved sys_sigreturn as part of the historical burden. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20230628091213.2908149-1-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/sigcontext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/uapi/asm/sigcontext.h b/arch/riscv/include/uapi/asm/sigcontext.h index 8c8712aa9551..cd4f175dc837 100644 --- a/arch/riscv/include/uapi/asm/sigcontext.h +++ b/arch/riscv/include/uapi/asm/sigcontext.h @@ -25,7 +25,7 @@ struct __sc_riscv_v_state { * Signal context structure * * This contains the context saved before a signal handler is invoked; - * it is restored by sys_sigreturn / sys_rt_sigreturn. + * it is restored by sys_rt_sigreturn. */ struct sigcontext { struct user_regs_struct sc_regs; From 23059893967826467e36c6d9f7428cc79dd48a11 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Thu, 13 Jul 2023 13:10:59 +0100 Subject: [PATCH 02/51] RISC-V: Provide a more helpful error message on invalid ISA strings Right now we provide a somewhat unhelpful error message on systems with invalid ISA strings, something along the lines of CPU with hartid=0 is not available ------------[ cut here ]------------ kernel BUG at arch/riscv/kernel/smpboot.c:174! 
Kernel BUG [#1] Modules linked in: CPU: 0 PID: 0 Comm: swapper Not tainted 6.4.0-rc1-00096-ge0097d2c62d5-dirty #1 Hardware name: Microchip PolarFire-SoC Icicle Kit (DT) epc : of_parse_and_init_cpus+0x16c/0x16e ra : of_parse_and_init_cpus+0x9a/0x16e epc : ffffffff80c04e0a ra : ffffffff80c04d38 sp : ffffffff81603e20 gp : ffffffff8182d658 tp : ffffffff81613f80 t0 : 000000000000006e t1 : 0000000000000064 t2 : 0000000000000000 s0 : ffffffff81603e80 s1 : 0000000000000000 a0 : 0000000000000000 a1 : 0000000000000000 a2 : 0000000000000000 a3 : 0000000000000000 a4 : 0000000000000000 a5 : 0000000000001fff a6 : 0000000000001fff a7 : ffffffff816148b0 s2 : 0000000000000001 s3 : ffffffff81492a4c s4 : ffffffff81a4b090 s5 : ffffffff81506030 s6 : 0000000000000040 s7 : 0000000000000000 s8 : 00000000bfb6f046 s9 : 0000000000000001 s10: 0000000000000000 s11: 00000000bf389700 t3 : 0000000000000000 t4 : 0000000000000000 t5 : ffffffff824dd188 t6 : ffffffff824dd187 status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [] of_parse_and_init_cpus+0x16c/0x16e [] setup_smp+0x1e/0x26 [] setup_arch+0x6e/0xb2 [] start_kernel+0x72/0x400 Code: 80e7 4a00 a603 0009 b795 1097 ffe5 80e7 92c0 9002 (9002) 715d ---[ end trace 0000000000000000 ]--- Kernel panic - not syncing: Fatal exception in interrupt Add a warning for the cases where the ISA string isn't valid. It's still above the BUG_ON cut, but hopefully it's at least a bit easier for users. 
Reviewed-by: Evan Green Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-endless-spearhead-62a5a4b149bd@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index a2fc952318e9..3af2d214ce21 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -66,11 +66,15 @@ int riscv_early_of_processor_hartid(struct device_node *node, unsigned long *har return -ENODEV; } - if (IS_ENABLED(CONFIG_32BIT) && strncasecmp(isa, "rv32ima", 7)) + if (IS_ENABLED(CONFIG_32BIT) && strncasecmp(isa, "rv32ima", 7)) { + pr_warn("CPU with hartid=%lu does not support rv32ima", *hart); return -ENODEV; + } - if (IS_ENABLED(CONFIG_64BIT) && strncasecmp(isa, "rv64ima", 7)) + if (IS_ENABLED(CONFIG_64BIT) && strncasecmp(isa, "rv64ima", 7)) { + pr_warn("CPU with hartid=%lu does not support rv64ima", *hart); return -ENODEV; + } return 0; } From 67270fb388feb3ab7b8f1c8e2825d43237ffa7a0 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 13 Jul 2023 13:11:00 +0100 Subject: [PATCH 03/51] RISC-V: don't parse dt/acpi isa string to get rv32/rv64 When filling hwcap the kernel already expects the isa string to start with rv32 if CONFIG_32BIT and rv64 if CONFIG_64BIT. So when recreating the runtime isa-string we can also just go the other way to get the correct starting point for it. 
Signed-off-by: Heiko Stuebner Reviewed-by: Andrew Jones Reviewed-by: Evan Green Co-developed-by: Conor Dooley Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-masculine-saddlebag-67a94966b091@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 3af2d214ce21..f808b67f5a27 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -257,13 +257,16 @@ static void print_isa_ext(struct seq_file *f) */ static const char base_riscv_exts[13] = "imafdqcbkjpvh"; -static void print_isa(struct seq_file *f, const char *isa) +static void print_isa(struct seq_file *f) { int i; seq_puts(f, "isa\t\t: "); - /* Print the rv[64/32] part */ - seq_write(f, isa, 4); + if (IS_ENABLED(CONFIG_32BIT)) + seq_write(f, "rv32", 4); + else + seq_write(f, "rv64", 4); + for (i = 0; i < sizeof(base_riscv_exts); i++) { if (__riscv_isa_extension_available(NULL, base_riscv_exts[i] - 'a')) /* Print only enabled the base ISA extensions */ @@ -320,27 +323,21 @@ static int c_show(struct seq_file *m, void *v) unsigned long cpu_id = (unsigned long)v - 1; struct riscv_cpuinfo *ci = per_cpu_ptr(&riscv_cpuinfo, cpu_id); struct device_node *node; - const char *compat, *isa; + const char *compat; seq_printf(m, "processor\t: %lu\n", cpu_id); seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id)); + print_isa(m); + print_mmu(m); if (acpi_disabled) { node = of_get_cpu_node(cpu_id, NULL); - if (!of_property_read_string(node, "riscv,isa", &isa)) - print_isa(m, isa); - print_mmu(m); if (!of_property_read_string(node, "compatible", &compat) && strcmp(compat, "riscv")) seq_printf(m, "uarch\t\t: %s\n", compat); of_node_put(node); - } else { - if (!acpi_get_riscv_isa(NULL, cpu_id, &isa)) - print_isa(m, isa); - - print_mmu(m); } seq_printf(m, "mvendorid\t: 0x%lx\n", ci->mvendorid); From 131033689da2ce17fb1407f0d3e608a203be5e09 Mon Sep 
17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:01 +0100 Subject: [PATCH 04/51] RISC-V: drop a needless check in print_isa_ext() isa_ext_arr cannot be empty, as some of the extensions within it are always built into the kernel. When this code was first added, back in commit a9b202606c69 ("RISC-V: Improve /proc/cpuinfo output for ISA extensions"), the array was empty and needed a dummy item & thus there could be no extensions present. When the first multi-letter ones did get added, it was Sscofpmf - which didn't have a Kconfig symbol to disable it. Remove this check, as it has been redundant since Sscofpmf was added. Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-veggie-mug-3d3bf6787ae2@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index f808b67f5a27..e721f15fdf17 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -237,10 +237,6 @@ static void print_isa_ext(struct seq_file *f) arr_sz = ARRAY_SIZE(isa_ext_arr) - 1; - /* No extension support available */ - if (arr_sz <= 0) - return; - for (i = 0; i <= arr_sz; i++) { edata = &isa_ext_arr[i]; if (!__riscv_isa_extension_available(NULL, edata->isa_ext_id)) From 8135ade32c0db48e2e75b039b2c2454f722762de Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:02 +0100 Subject: [PATCH 05/51] RISC-V: shunt isa_ext_arr to cpufeature.c To facilitate using one struct to define extensions, rather than having several, shunt isa_ext_arr to cpufeature.c, where it will be used for probing extension presence also. As the scope of the array has widened, prefix it with riscv & drop the type from the variable name. Since the new array is const, print_isa_ext() needs a wee bit of cleanup to avoid complaints about losing the const qualifier. 
Reviewed-by: Andrew Jones Reviewed-by: Evan Green Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-spirits-upside-a2c61c65fd5a@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 3 ++ arch/riscv/kernel/cpu.c | 75 +--------------------------------- arch/riscv/kernel/cpufeature.c | 67 ++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 73 deletions(-) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index f041bfa7f6a0..7a57e6109aef 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -76,6 +76,9 @@ struct riscv_isa_ext_data { unsigned int isa_ext_id; }; +extern const struct riscv_isa_ext_data riscv_isa_ext[]; +extern const size_t riscv_isa_ext_count; + unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); #define riscv_isa_extension_mask(ext) BIT_MASK(RISCV_ISA_EXT_##ext) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index e721f15fdf17..bf93293d51f3 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -164,81 +164,10 @@ arch_initcall(riscv_cpuinfo_init); #ifdef CONFIG_PROC_FS -#define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ - { \ - .uprop = #UPROP, \ - .isa_ext_id = EXTID, \ - } - -/* - * The canonical order of ISA extension names in the ISA string is defined in - * chapter 27 of the unprivileged specification. - * - * Ordinarily, for in-kernel data structures, this order is unimportant but - * isa_ext_arr defines the order of the ISA string in /proc/cpuinfo. - * - * The specification uses vague wording, such as should, when it comes to - * ordering, so for our purposes the following rules apply: - * - * 1. All multi-letter extensions must be separated from other extensions by an - * underscore. - * - * 2. Additional standard extensions (starting with 'Z') must be sorted after - * single-letter extensions and before any higher-privileged extensions. - - * 3. 
The first letter following the 'Z' conventionally indicates the most - * closely related alphabetical extension category, IMAFDQLCBKJTPVH. - * If multiple 'Z' extensions are named, they must be ordered first by - * category, then alphabetically within a category. - * - * 3. Standard supervisor-level extensions (starting with 'S') must be listed - * after standard unprivileged extensions. If multiple supervisor-level - * extensions are listed, they must be ordered alphabetically. - * - * 4. Standard machine-level extensions (starting with 'Zxm') must be listed - * after any lower-privileged, standard extensions. If multiple - * machine-level extensions are listed, they must be ordered - * alphabetically. - * - * 5. Non-standard extensions (starting with 'X') must be listed after all - * standard extensions. If multiple non-standard extensions are listed, they - * must be ordered alphabetically. - * - * An example string following the order is: - * rv64imadc_zifoo_zigoo_zafoo_sbar_scar_zxmbaz_xqux_xrux - * - * New entries to this struct should follow the ordering rules described above. 
- */ -static struct riscv_isa_ext_data isa_ext_arr[] = { - __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), - __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), - __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), - __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), - __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI), - __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), - __RISCV_ISA_EXT_DATA(zihpm, RISCV_ISA_EXT_ZIHPM), - __RISCV_ISA_EXT_DATA(zba, RISCV_ISA_EXT_ZBA), - __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), - __RISCV_ISA_EXT_DATA(zbs, RISCV_ISA_EXT_ZBS), - __RISCV_ISA_EXT_DATA(smaia, RISCV_ISA_EXT_SMAIA), - __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), - __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), - __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), - __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), - __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), - __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), - __RISCV_ISA_EXT_DATA("", RISCV_ISA_EXT_MAX), -}; - static void print_isa_ext(struct seq_file *f) { - struct riscv_isa_ext_data *edata; - int i = 0, arr_sz; - - arr_sz = ARRAY_SIZE(isa_ext_arr) - 1; - - for (i = 0; i <= arr_sz; i++) { - edata = &isa_ext_arr[i]; + for (int i = 0; i < riscv_isa_ext_count; i++) { + const struct riscv_isa_ext_data *edata = &riscv_isa_ext[i]; if (!__riscv_isa_extension_available(NULL, edata->isa_ext_id)) continue; seq_printf(f, "_%s", edata->uprop); diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index bdcf460ea53d..fb476153fffc 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -99,6 +99,73 @@ static bool riscv_isa_extension_check(int id) return true; } +#define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ + { \ + .uprop = #UPROP, \ + .isa_ext_id = EXTID, \ + } + +/* + * The canonical order of ISA extension names in the ISA string is defined in + * chapter 27 of the unprivileged specification. 
+ * + * Ordinarily, for in-kernel data structures, this order is unimportant but + * isa_ext_arr defines the order of the ISA string in /proc/cpuinfo. + * + * The specification uses vague wording, such as should, when it comes to + * ordering, so for our purposes the following rules apply: + * + * 1. All multi-letter extensions must be separated from other extensions by an + * underscore. + * + * 2. Additional standard extensions (starting with 'Z') must be sorted after + * single-letter extensions and before any higher-privileged extensions. + * + * 3. The first letter following the 'Z' conventionally indicates the most + * closely related alphabetical extension category, IMAFDQLCBKJTPVH. + * If multiple 'Z' extensions are named, they must be ordered first by + * category, then alphabetically within a category. + * + * 3. Standard supervisor-level extensions (starting with 'S') must be listed + * after standard unprivileged extensions. If multiple supervisor-level + * extensions are listed, they must be ordered alphabetically. + * + * 4. Standard machine-level extensions (starting with 'Zxm') must be listed + * after any lower-privileged, standard extensions. If multiple + * machine-level extensions are listed, they must be ordered + * alphabetically. + * + * 5. Non-standard extensions (starting with 'X') must be listed after all + * standard extensions. If multiple non-standard extensions are listed, they + * must be ordered alphabetically. + * + * An example string following the order is: + * rv64imadc_zifoo_zigoo_zafoo_sbar_scar_zxmbaz_xqux_xrux + * + * New entries to this struct should follow the ordering rules described above. 
+ */ +const struct riscv_isa_ext_data riscv_isa_ext[] = { + __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), + __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), + __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), + __RISCV_ISA_EXT_DATA(zicsr, RISCV_ISA_EXT_ZICSR), + __RISCV_ISA_EXT_DATA(zifencei, RISCV_ISA_EXT_ZIFENCEI), + __RISCV_ISA_EXT_DATA(zihintpause, RISCV_ISA_EXT_ZIHINTPAUSE), + __RISCV_ISA_EXT_DATA(zihpm, RISCV_ISA_EXT_ZIHPM), + __RISCV_ISA_EXT_DATA(zba, RISCV_ISA_EXT_ZBA), + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB), + __RISCV_ISA_EXT_DATA(zbs, RISCV_ISA_EXT_ZBS), + __RISCV_ISA_EXT_DATA(smaia, RISCV_ISA_EXT_SMAIA), + __RISCV_ISA_EXT_DATA(ssaia, RISCV_ISA_EXT_SSAIA), + __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF), + __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC), + __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL), + __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT), + __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT), +}; + +const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext); + void __init riscv_fill_hwcap(void) { struct device_node *node; From 37f988dcec056532820fb7c3e9a8367fd19f6c1b Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:03 +0100 Subject: [PATCH 06/51] RISC-V: repurpose riscv_isa_ext array in riscv_fill_hwcap() In riscv_fill_hwcap() riscv_isa_ext array can be looped over, rather than duplicating the list of extensions with individual SET_ISA_EXT_MAP() usage. While at it, drop the statement-of-the-obvious comments from the struct, rename uprop to something more suitable for its new use & constify the members. 
Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-dastardly-affiliate-4cf819dccde2@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 7 ++----- arch/riscv/kernel/cpu.c | 5 +++-- arch/riscv/kernel/cpufeature.c | 30 +++++++++--------------------- 3 files changed, 14 insertions(+), 28 deletions(-) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 7a57e6109aef..2460ac2fc7ed 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -55,7 +55,6 @@ #define RISCV_ISA_EXT_ZIHPM 42 #define RISCV_ISA_EXT_MAX 64 -#define RISCV_ISA_EXT_NAME_LEN_MAX 32 #ifdef CONFIG_RISCV_M_MODE #define RISCV_ISA_EXT_SxAIA RISCV_ISA_EXT_SMAIA @@ -70,10 +69,8 @@ unsigned long riscv_get_elf_hwcap(void); struct riscv_isa_ext_data { - /* Name of the extension displayed to userspace via /proc/cpuinfo */ - char uprop[RISCV_ISA_EXT_NAME_LEN_MAX]; - /* The logical ISA extension ID */ - unsigned int isa_ext_id; + const unsigned int id; + const char *name; }; extern const struct riscv_isa_ext_data riscv_isa_ext[]; diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index bf93293d51f3..aa17eeb0ec9a 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -168,9 +168,10 @@ static void print_isa_ext(struct seq_file *f) { for (int i = 0; i < riscv_isa_ext_count; i++) { const struct riscv_isa_ext_data *edata = &riscv_isa_ext[i]; - if (!__riscv_isa_extension_available(NULL, edata->isa_ext_id)) + if (!__riscv_isa_extension_available(NULL, edata->id)) continue; - seq_printf(f, "_%s", edata->uprop); + + seq_printf(f, "_%s", edata->name); } } diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index fb476153fffc..859e3831fac1 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -99,11 +99,10 @@ static bool riscv_isa_extension_check(int id) return true; } -#define __RISCV_ISA_EXT_DATA(UPROP, EXTID) \ - { \ - 
.uprop = #UPROP, \ - .isa_ext_id = EXTID, \ - } +#define __RISCV_ISA_EXT_DATA(_name, _id) { \ + .name = #_name, \ + .id = _id, \ +} /* * The canonical order of ISA extension names in the ISA string is defined in @@ -350,8 +349,8 @@ void __init riscv_fill_hwcap(void) #define SET_ISA_EXT_MAP(name, bit) \ do { \ - if ((ext_end - ext == sizeof(name) - 1) && \ - !strncasecmp(ext, name, sizeof(name) - 1) && \ + if ((ext_end - ext == strlen(name)) && \ + !strncasecmp(ext, name, strlen(name)) && \ riscv_isa_extension_check(bit)) \ set_bit(bit, isainfo->isa); \ } while (false) \ @@ -366,20 +365,9 @@ void __init riscv_fill_hwcap(void) set_bit(nr, isainfo->isa); } } else { - /* sorted alphabetically */ - SET_ISA_EXT_MAP("smaia", RISCV_ISA_EXT_SMAIA); - SET_ISA_EXT_MAP("ssaia", RISCV_ISA_EXT_SSAIA); - SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF); - SET_ISA_EXT_MAP("sstc", RISCV_ISA_EXT_SSTC); - SET_ISA_EXT_MAP("svinval", RISCV_ISA_EXT_SVINVAL); - SET_ISA_EXT_MAP("svnapot", RISCV_ISA_EXT_SVNAPOT); - SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT); - SET_ISA_EXT_MAP("zba", RISCV_ISA_EXT_ZBA); - SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB); - SET_ISA_EXT_MAP("zbs", RISCV_ISA_EXT_ZBS); - SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM); - SET_ISA_EXT_MAP("zicboz", RISCV_ISA_EXT_ZICBOZ); - SET_ISA_EXT_MAP("zihintpause", RISCV_ISA_EXT_ZIHINTPAUSE); + for (int i = 0; i < riscv_isa_ext_count; i++) + SET_ISA_EXT_MAP(riscv_isa_ext[i].name, + riscv_isa_ext[i].id); } #undef SET_ISA_EXT_MAP } From c30556e318cc3d8e04349aea6505639ec8a5fbdc Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:04 +0100 Subject: [PATCH 07/51] RISC-V: add missing single letter extension definitions To facilitate adding single letter extensions to riscv_isa_ext, add definitions for the extensions present in base_riscv_exts that do not already have them. 
Reviewed-by: Andrew Jones Reviewed-by: Evan Green Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-train-feisty-93de38250f98@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 2460ac2fc7ed..a20e4ade1b53 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -14,12 +14,17 @@ #include #define RISCV_ISA_EXT_a ('a' - 'a') +#define RISCV_ISA_EXT_b ('b' - 'a') #define RISCV_ISA_EXT_c ('c' - 'a') #define RISCV_ISA_EXT_d ('d' - 'a') #define RISCV_ISA_EXT_f ('f' - 'a') #define RISCV_ISA_EXT_h ('h' - 'a') #define RISCV_ISA_EXT_i ('i' - 'a') +#define RISCV_ISA_EXT_j ('j' - 'a') +#define RISCV_ISA_EXT_k ('k' - 'a') #define RISCV_ISA_EXT_m ('m' - 'a') +#define RISCV_ISA_EXT_p ('p' - 'a') +#define RISCV_ISA_EXT_q ('q' - 'a') #define RISCV_ISA_EXT_s ('s' - 'a') #define RISCV_ISA_EXT_u ('u' - 'a') #define RISCV_ISA_EXT_v ('v' - 'a') From effc122ad17698996ce2e839c69695cd22645be1 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:05 +0100 Subject: [PATCH 08/51] RISC-V: add single letter extensions to riscv_isa_ext So that riscv_fill_hwcap() can use riscv_isa_ext to probe for single letter extensions, add them to it. As a result, what gets spat out in /proc/cpuinfo will become borked, as single letter extensions will be printed both as part of the base extensions and while printing from riscv_isa_ext. Take the opportunity to unify the printing of the isa string, using the new member of riscv_isa_ext_data in the process. 
Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-despite-bright-de00ac888cc7@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 37 ++++++++++------------------------ arch/riscv/kernel/cpufeature.c | 13 ++++++++++++ 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index aa17eeb0ec9a..4f1f12f34b63 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -164,41 +164,26 @@ arch_initcall(riscv_cpuinfo_init); #ifdef CONFIG_PROC_FS -static void print_isa_ext(struct seq_file *f) -{ - for (int i = 0; i < riscv_isa_ext_count; i++) { - const struct riscv_isa_ext_data *edata = &riscv_isa_ext[i]; - if (!__riscv_isa_extension_available(NULL, edata->id)) - continue; - - seq_printf(f, "_%s", edata->name); - } -} - -/* - * These are the only valid base (single letter) ISA extensions as per the spec. - * It also specifies the canonical order in which it appears in the spec. - * Some of the extension may just be a place holder for now (B, K, P, J). - * This should be updated once corresponding extensions are ratified. 
- */ -static const char base_riscv_exts[13] = "imafdqcbkjpvh"; - static void print_isa(struct seq_file *f) { - int i; - seq_puts(f, "isa\t\t: "); + if (IS_ENABLED(CONFIG_32BIT)) seq_write(f, "rv32", 4); else seq_write(f, "rv64", 4); - for (i = 0; i < sizeof(base_riscv_exts); i++) { - if (__riscv_isa_extension_available(NULL, base_riscv_exts[i] - 'a')) - /* Print only enabled the base ISA extensions */ - seq_write(f, &base_riscv_exts[i], 1); + for (int i = 0; i < riscv_isa_ext_count; i++) { + if (!__riscv_isa_extension_available(NULL, riscv_isa_ext[i].id)) + continue; + + /* Only multi-letter extensions are split by underscores */ + if (strnlen(riscv_isa_ext[i].name, 2) != 1) + seq_puts(f, "_"); + + seq_printf(f, "%s", riscv_isa_ext[i].name); } - print_isa_ext(f); + seq_puts(f, "\n"); } diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 859e3831fac1..d6009bd4c186 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -144,6 +144,19 @@ static bool riscv_isa_extension_check(int id) * New entries to this struct should follow the ordering rules described above. 
*/ const struct riscv_isa_ext_data riscv_isa_ext[] = { + __RISCV_ISA_EXT_DATA(i, RISCV_ISA_EXT_i), + __RISCV_ISA_EXT_DATA(m, RISCV_ISA_EXT_m), + __RISCV_ISA_EXT_DATA(a, RISCV_ISA_EXT_a), + __RISCV_ISA_EXT_DATA(f, RISCV_ISA_EXT_f), + __RISCV_ISA_EXT_DATA(d, RISCV_ISA_EXT_d), + __RISCV_ISA_EXT_DATA(q, RISCV_ISA_EXT_q), + __RISCV_ISA_EXT_DATA(c, RISCV_ISA_EXT_c), + __RISCV_ISA_EXT_DATA(b, RISCV_ISA_EXT_b), + __RISCV_ISA_EXT_DATA(k, RISCV_ISA_EXT_k), + __RISCV_ISA_EXT_DATA(j, RISCV_ISA_EXT_j), + __RISCV_ISA_EXT_DATA(p, RISCV_ISA_EXT_p), + __RISCV_ISA_EXT_DATA(v, RISCV_ISA_EXT_v), + __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), __RISCV_ISA_EXT_DATA(zicbom, RISCV_ISA_EXT_ZICBOM), __RISCV_ISA_EXT_DATA(zicboz, RISCV_ISA_EXT_ZICBOZ), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), From 4265b0ec5ee7bf64639cc088f378a51ca335b8a4 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:06 +0100 Subject: [PATCH 09/51] RISC-V: split riscv_fill_hwcap() in 3 Before adding more complexity to it, split riscv_fill_hwcap() into 3 distinct sections: - riscv_fill_hwcap() still is the top level function, into which the additional complexity will be added. - riscv_fill_hwcap_from_isa_string() handles getting the information from the riscv,isa/ACPI equivalent across harts & the various quirks there - riscv_parse_isa_string() does what it says on the tin. 
Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-daylight-puritan-37aeb41a4d9b@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 345 +++++++++++++++++---------------- 1 file changed, 177 insertions(+), 168 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index d6009bd4c186..7c661b12ac8d 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -178,29 +178,172 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext); -void __init riscv_fill_hwcap(void) +static void __init riscv_parse_isa_string(unsigned long *this_hwcap, struct riscv_isainfo *isainfo, + unsigned long *isa2hwcap, const char *isa) +{ + /* + * For all possible cpus, we have already validated in + * the boot process that they at least contain "rv" and + * whichever of "32"/"64" this kernel supports, and so this + * section can be skipped. + */ + isa += 4; + + while (*isa) { + const char *ext = isa++; + const char *ext_end = isa; + bool ext_long = false, ext_err = false; + + switch (*ext) { + case 's': + /* + * Workaround for invalid single-letter 's' & 'u'(QEMU). + * No need to set the bit in riscv_isa as 's' & 'u' are + * not valid ISA extensions. It works until multi-letter + * extension starting with "Su" appears. + */ + if (ext[-1] != '_' && ext[1] == 'u') { + ++isa; + ext_err = true; + break; + } + fallthrough; + case 'S': + case 'x': + case 'X': + case 'z': + case 'Z': + /* + * Before attempting to parse the extension itself, we find its end. + * As multi-letter extensions must be split from other multi-letter + * extensions with an "_", the end of a multi-letter extension will + * either be the null character or the "_" at the start of the next + * multi-letter extension. + * + * Next, as the extensions version is currently ignored, we + * eliminate that portion. 
This is done by parsing backwards from + * the end of the extension, removing any numbers. This may be a + * major or minor number however, so the process is repeated if a + * minor number was found. + * + * ext_end is intended to represent the first character *after* the + * name portion of an extension, but will be decremented to the last + * character itself while eliminating the extensions version number. + * A simple re-increment solves this problem. + */ + ext_long = true; + for (; *isa && *isa != '_'; ++isa) + if (unlikely(!isalnum(*isa))) + ext_err = true; + + ext_end = isa; + if (unlikely(ext_err)) + break; + + if (!isdigit(ext_end[-1])) + break; + + while (isdigit(*--ext_end)) + ; + + if (tolower(ext_end[0]) != 'p' || !isdigit(ext_end[-1])) { + ++ext_end; + break; + } + + while (isdigit(*--ext_end)) + ; + + ++ext_end; + break; + default: + /* + * Things are a little easier for single-letter extensions, as they + * are parsed forwards. + * + * After checking that our starting position is valid, we need to + * ensure that, when isa was incremented at the start of the loop, + * that it arrived at the start of the next extension. + * + * If we are already on a non-digit, there is nothing to do. Either + * we have a multi-letter extension's _, or the start of an + * extension. + * + * Otherwise we have found the current extension's major version + * number. Parse past it, and a subsequent p/minor version number + * if present. The `p` extension must not appear immediately after + * a number, so there is no fear of missing it. + * + */ + if (unlikely(!isalpha(*ext))) { + ext_err = true; + break; + } + + if (!isdigit(*isa)) + break; + + while (isdigit(*++isa)) + ; + + if (tolower(*isa) != 'p') + break; + + if (!isdigit(*++isa)) { + --isa; + break; + } + + while (isdigit(*++isa)) + ; + + break; + } + + /* + * The parser expects that at the start of an iteration isa points to the + * first character of the next extension. 
As we stop parsing an extension + * on meeting a non-alphanumeric character, an extra increment is needed + * where the succeeding extension is a multi-letter prefixed with an "_". + */ + if (*isa == '_') + ++isa; + +#define SET_ISA_EXT_MAP(name, bit) \ + do { \ + if ((ext_end - ext == strlen(name)) && \ + !strncasecmp(ext, name, strlen(name)) && \ + riscv_isa_extension_check(bit)) \ + set_bit(bit, isainfo->isa); \ + } while (false) \ + + if (unlikely(ext_err)) + continue; + if (!ext_long) { + int nr = tolower(*ext) - 'a'; + + if (riscv_isa_extension_check(nr)) { + *this_hwcap |= isa2hwcap[nr]; + set_bit(nr, isainfo->isa); + } + } else { + for (int i = 0; i < riscv_isa_ext_count; i++) + SET_ISA_EXT_MAP(riscv_isa_ext[i].name, + riscv_isa_ext[i].id); + } +#undef SET_ISA_EXT_MAP + } +} + +static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) { struct device_node *node; const char *isa; - char print_str[NUM_ALPHA_EXTS + 1]; - int i, j, rc; - unsigned long isa2hwcap[26] = {0}; + int rc; struct acpi_table_header *rhct; acpi_status status; unsigned int cpu; - isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; - isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; - isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; - isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; - isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; - isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; - isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; - - elf_hwcap = 0; - - bitmap_zero(riscv_isa, RISCV_ISA_EXT_MAX); - if (!acpi_disabled) { status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct); if (ACPI_FAILURE(status)) @@ -232,158 +375,7 @@ void __init riscv_fill_hwcap(void) } } - /* - * For all possible cpus, we have already validated in - * the boot process that they at least contain "rv" and - * whichever of "32"/"64" this kernel supports, and so this - * section can be skipped. 
- */ - isa += 4; - - while (*isa) { - const char *ext = isa++; - const char *ext_end = isa; - bool ext_long = false, ext_err = false; - - switch (*ext) { - case 's': - /* - * Workaround for invalid single-letter 's' & 'u'(QEMU). - * No need to set the bit in riscv_isa as 's' & 'u' are - * not valid ISA extensions. It works until multi-letter - * extension starting with "Su" appears. - */ - if (ext[-1] != '_' && ext[1] == 'u') { - ++isa; - ext_err = true; - break; - } - fallthrough; - case 'S': - case 'x': - case 'X': - case 'z': - case 'Z': - /* - * Before attempting to parse the extension itself, we find its end. - * As multi-letter extensions must be split from other multi-letter - * extensions with an "_", the end of a multi-letter extension will - * either be the null character or the "_" at the start of the next - * multi-letter extension. - * - * Next, as the extensions version is currently ignored, we - * eliminate that portion. This is done by parsing backwards from - * the end of the extension, removing any numbers. This may be a - * major or minor number however, so the process is repeated if a - * minor number was found. - * - * ext_end is intended to represent the first character *after* the - * name portion of an extension, but will be decremented to the last - * character itself while eliminating the extensions version number. - * A simple re-increment solves this problem. - */ - ext_long = true; - for (; *isa && *isa != '_'; ++isa) - if (unlikely(!isalnum(*isa))) - ext_err = true; - - ext_end = isa; - if (unlikely(ext_err)) - break; - - if (!isdigit(ext_end[-1])) - break; - - while (isdigit(*--ext_end)) - ; - - if (tolower(ext_end[0]) != 'p' || !isdigit(ext_end[-1])) { - ++ext_end; - break; - } - - while (isdigit(*--ext_end)) - ; - - ++ext_end; - break; - default: - /* - * Things are a little easier for single-letter extensions, as they - * are parsed forwards. 
- * - * After checking that our starting position is valid, we need to - * ensure that, when isa was incremented at the start of the loop, - * that it arrived at the start of the next extension. - * - * If we are already on a non-digit, there is nothing to do. Either - * we have a multi-letter extension's _, or the start of an - * extension. - * - * Otherwise we have found the current extension's major version - * number. Parse past it, and a subsequent p/minor version number - * if present. The `p` extension must not appear immediately after - * a number, so there is no fear of missing it. - * - */ - if (unlikely(!isalpha(*ext))) { - ext_err = true; - break; - } - - if (!isdigit(*isa)) - break; - - while (isdigit(*++isa)) - ; - - if (tolower(*isa) != 'p') - break; - - if (!isdigit(*++isa)) { - --isa; - break; - } - - while (isdigit(*++isa)) - ; - - break; - } - - /* - * The parser expects that at the start of an iteration isa points to the - * first character of the next extension. As we stop parsing an extension - * on meeting a non-alphanumeric character, an extra increment is needed - * where the succeeding extension is a multi-letter prefixed with an "_". 
- */ - if (*isa == '_') - ++isa; - -#define SET_ISA_EXT_MAP(name, bit) \ - do { \ - if ((ext_end - ext == strlen(name)) && \ - !strncasecmp(ext, name, strlen(name)) && \ - riscv_isa_extension_check(bit)) \ - set_bit(bit, isainfo->isa); \ - } while (false) \ - - if (unlikely(ext_err)) - continue; - if (!ext_long) { - int nr = tolower(*ext) - 'a'; - - if (riscv_isa_extension_check(nr)) { - this_hwcap |= isa2hwcap[nr]; - set_bit(nr, isainfo->isa); - } - } else { - for (int i = 0; i < riscv_isa_ext_count; i++) - SET_ISA_EXT_MAP(riscv_isa_ext[i].name, - riscv_isa_ext[i].id); - } -#undef SET_ISA_EXT_MAP - } + riscv_parse_isa_string(&this_hwcap, isainfo, isa2hwcap, isa); /* * Linux requires the following extensions, so we may as well @@ -420,6 +412,23 @@ void __init riscv_fill_hwcap(void) if (!acpi_disabled && rhct) acpi_put_table((struct acpi_table_header *)rhct); +} + +void __init riscv_fill_hwcap(void) +{ + char print_str[NUM_ALPHA_EXTS + 1]; + int i, j; + unsigned long isa2hwcap[26] = {0}; + + isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; + isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; + isa2hwcap['a' - 'a'] = COMPAT_HWCAP_ISA_A; + isa2hwcap['f' - 'a'] = COMPAT_HWCAP_ISA_F; + isa2hwcap['d' - 'a'] = COMPAT_HWCAP_ISA_D; + isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; + isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; + + riscv_fill_hwcap_from_isa_string(isa2hwcap); /* We don't support systems with F but without D, so mask those out * here. */ From 90700a4fbfaf30bc792b72ddda5666a19ddd6c6a Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:07 +0100 Subject: [PATCH 10/51] RISC-V: enable extension detection from dedicated properties Add support for parsing the new riscv,isa-extensions property in riscv_fill_hwcap(), by means of a new "property" member of the riscv_isa_ext_data struct. For now, this shadows the name of the extension for all users, however this may not be the case for all extensions, based on how the dt-binding is written. 
For the sake of backwards compatibility, fall back to the old scheme if the new properties are not detected. For now, just inform, rather than warn, when that happens. Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-vocation-profane-39a74b3c2649@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/kernel/cpufeature.c | 78 ++++++++++++++++++++++++++++++++-- 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index a20e4ade1b53..e3cda14a486b 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -76,6 +76,7 @@ unsigned long riscv_get_elf_hwcap(void); struct riscv_isa_ext_data { const unsigned int id; const char *name; + const char *property; }; extern const struct riscv_isa_ext_data riscv_isa_ext[]; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 7c661b12ac8d..fdc71e52dc2b 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -101,6 +101,7 @@ static bool riscv_isa_extension_check(int id) #define __RISCV_ISA_EXT_DATA(_name, _id) { \ .name = #_name, \ + .property = #_name, \ .id = _id, \ } @@ -414,11 +415,69 @@ static void __init riscv_fill_hwcap_from_isa_string(unsigned long *isa2hwcap) acpi_put_table((struct acpi_table_header *)rhct); } +static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + unsigned long this_hwcap = 0; + struct device_node *cpu_node; + struct riscv_isainfo *isainfo = &hart_isa[cpu]; + + cpu_node = of_cpu_device_node_get(cpu); + if (!cpu_node) { + pr_warn("Unable to find cpu node\n"); + continue; + } + + if (!of_property_present(cpu_node, "riscv,isa-extensions")) { + of_node_put(cpu_node); + continue; + } + + for (int i = 0; i < riscv_isa_ext_count; i++) { + if (of_property_match_string(cpu_node, "riscv,isa-extensions", + 
riscv_isa_ext[i].property) < 0) + continue; + + if (!riscv_isa_extension_check(riscv_isa_ext[i].id)) + continue; + + /* Only single letter extensions get set in hwcap */ + if (strnlen(riscv_isa_ext[i].name, 2) == 1) + this_hwcap |= isa2hwcap[riscv_isa_ext[i].id]; + + set_bit(riscv_isa_ext[i].id, isainfo->isa); + } + + of_node_put(cpu_node); + + /* + * All "okay" harts should have same isa. Set HWCAP based on + * common capabilities of every "okay" hart, in case they don't. + */ + if (elf_hwcap) + elf_hwcap &= this_hwcap; + else + elf_hwcap = this_hwcap; + + if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) + bitmap_copy(riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + else + bitmap_and(riscv_isa, riscv_isa, isainfo->isa, RISCV_ISA_EXT_MAX); + } + + if (bitmap_empty(riscv_isa, RISCV_ISA_EXT_MAX)) + return -ENOENT; + + return 0; +} + void __init riscv_fill_hwcap(void) { char print_str[NUM_ALPHA_EXTS + 1]; - int i, j; unsigned long isa2hwcap[26] = {0}; + int i, j; isa2hwcap['i' - 'a'] = COMPAT_HWCAP_ISA_I; isa2hwcap['m' - 'a'] = COMPAT_HWCAP_ISA_M; @@ -428,10 +487,21 @@ void __init riscv_fill_hwcap(void) isa2hwcap['c' - 'a'] = COMPAT_HWCAP_ISA_C; isa2hwcap['v' - 'a'] = COMPAT_HWCAP_ISA_V; - riscv_fill_hwcap_from_isa_string(isa2hwcap); + if (!acpi_disabled) { + riscv_fill_hwcap_from_isa_string(isa2hwcap); + } else { + int ret = riscv_fill_hwcap_from_ext_list(isa2hwcap); - /* We don't support systems with F but without D, so mask those out - * here. */ + if (ret) { + pr_info("Falling back to deprecated \"riscv,isa\"\n"); + riscv_fill_hwcap_from_isa_string(isa2hwcap); + } + } + + /* + * We don't support systems with F but without D, so mask those out + * here. 
+ */ if ((elf_hwcap & COMPAT_HWCAP_ISA_F) && !(elf_hwcap & COMPAT_HWCAP_ISA_D)) { pr_info("This kernel does not support systems with F but not D\n"); elf_hwcap &= ~COMPAT_HWCAP_ISA_F; From c98f136aedbd4384311b06db4ec9d2fd1996cffc Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:08 +0100 Subject: [PATCH 11/51] RISC-V: try new extension properties in of_early_processor_hartid() To fully deprecate the kernel's use of "riscv,isa", of_early_processor_hartid() needs to first try using the new properties, before falling back to "riscv,isa". Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-tablet-jimmy-987fea0eb2e1@wendy Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 4f1f12f34b63..28d5af21f544 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -61,8 +61,35 @@ int riscv_early_of_processor_hartid(struct device_node *node, unsigned long *har return -ENODEV; } + if (of_property_read_string(node, "riscv,isa-base", &isa)) + goto old_interface; + + if (IS_ENABLED(CONFIG_32BIT) && strncasecmp(isa, "rv32i", 5)) { + pr_warn("CPU with hartid=%lu does not support rv32i", *hart); + return -ENODEV; + } + + if (IS_ENABLED(CONFIG_64BIT) && strncasecmp(isa, "rv64i", 5)) { + pr_warn("CPU with hartid=%lu does not support rv64i", *hart); + return -ENODEV; + } + + if (!of_property_present(node, "riscv,isa-extensions")) + return -ENODEV; + + if (of_property_match_string(node, "riscv,isa-extensions", "i") < 0 || + of_property_match_string(node, "riscv,isa-extensions", "m") < 0 || + of_property_match_string(node, "riscv,isa-extensions", "a") < 0) { + pr_warn("CPU with hartid=%lu does not support ima", *hart); + return -ENODEV; + } + + return 0; + +old_interface: if (of_property_read_string(node, "riscv,isa", &isa)) { - pr_warn("CPU with hartid=%lu 
has no \"riscv,isa\" property\n", *hart); + pr_warn("CPU with hartid=%lu has no \"riscv,isa-base\" or \"riscv,isa\" property\n", + *hart); return -ENODEV; } From 496ea826d1e1f9744e2a4c36043db933deebab43 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Thu, 13 Jul 2023 13:11:09 +0100 Subject: [PATCH 12/51] RISC-V: provide Kconfig & commandline options to control parsing "riscv,isa" As it says on the tin, provide Kconfig option to control parsing the "riscv,isa" devicetree property. If either option is used, the kernel will fall back to parsing "riscv,isa", where "riscv,isa-base" and "riscv,isa-extensions" are not present. The Kconfig options are set up so that the default kernel configuration will enable the fallback path, without needing the commandline option. Suggested-by: Andrew Jones Suggested-by: Palmer Dabbelt Reviewed-by: Andrew Jones Signed-off-by: Conor Dooley Link: https://lore.kernel.org/r/20230713-aviator-plausibly-a35662485c2c@wendy Signed-off-by: Palmer Dabbelt --- .../admin-guide/kernel-parameters.txt | 7 +++++++ arch/riscv/Kconfig | 18 ++++++++++++++++++ arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/kernel/cpu.c | 8 +++++++- arch/riscv/kernel/cpufeature.c | 14 +++++++++++++- 5 files changed, 46 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..bdc3fa712e92 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5468,6 +5468,13 @@ [KNL] Disable ring 3 MONITOR/MWAIT feature on supported CPUs. + riscv_isa_fallback [RISCV] + When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit + falling back to detecting extension support by parsing + "riscv,isa" property on devicetree systems when the + replacement properties are not found. See the Kconfig + entry for RISCV_ISA_FALLBACK. 
+ ro [KNL] Mount root device read-only on boot rodata= [KNL] diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..f52dd125ac5e 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -848,6 +848,24 @@ config XIP_PHYS_ADDR be linked for and stored to. This address is dependent on your own flash usage. +config RISCV_ISA_FALLBACK + bool "Permit falling back to parsing riscv,isa for extension support by default" + default y + help + Parsing the "riscv,isa" devicetree property has been deprecated and + replaced by a list of explicitly defined strings. For compatibility + with existing platforms, the kernel will fall back to parsing the + "riscv,isa" property if the replacements are not found. + + Selecting N here will result in a kernel that does not use the + fallback, unless the commandline "riscv_isa_fallback" parameter is + present. + + Please see the dt-binding, located at + Documentation/devicetree/bindings/riscv/extensions.yaml for details + on the replacement properties, "riscv,isa-base" and + "riscv,isa-extensions". 
+ endmenu # "Boot options" config BUILTIN_DTB diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index e3cda14a486b..b7b58258f6c7 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -81,6 +81,7 @@ struct riscv_isa_ext_data { extern const struct riscv_isa_ext_data riscv_isa_ext[]; extern const size_t riscv_isa_ext_count; +extern bool riscv_isa_fallback; unsigned long riscv_isa_extension_base(const unsigned long *isa_bitmap); diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 28d5af21f544..208f1a700121 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -41,7 +41,7 @@ int riscv_of_processor_hartid(struct device_node *node, unsigned long *hart) return 0; } -int riscv_early_of_processor_hartid(struct device_node *node, unsigned long *hart) +int __init riscv_early_of_processor_hartid(struct device_node *node, unsigned long *hart) { const char *isa; @@ -87,6 +87,12 @@ int riscv_early_of_processor_hartid(struct device_node *node, unsigned long *har return 0; old_interface: + if (!riscv_isa_fallback) { + pr_warn("CPU with hartid=%lu is invalid: this kernel does not parse \"riscv,isa\"", + *hart); + return -ENODEV; + } + if (of_property_read_string(node, "riscv,isa", &isa)) { pr_warn("CPU with hartid=%lu has no \"riscv,isa-base\" or \"riscv,isa\" property\n", *hart); diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index fdc71e52dc2b..71fb840ee246 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -473,6 +473,18 @@ static int __init riscv_fill_hwcap_from_ext_list(unsigned long *isa2hwcap) return 0; } +#ifdef CONFIG_RISCV_ISA_FALLBACK +bool __initdata riscv_isa_fallback = true; +#else +bool __initdata riscv_isa_fallback; +static int __init riscv_isa_fallback_setup(char *__unused) +{ + riscv_isa_fallback = true; + return 1; +} +early_param("riscv_isa_fallback", riscv_isa_fallback_setup); +#endif + void __init 
riscv_fill_hwcap(void) { char print_str[NUM_ALPHA_EXTS + 1]; @@ -492,7 +504,7 @@ void __init riscv_fill_hwcap(void) } else { int ret = riscv_fill_hwcap_from_ext_list(isa2hwcap); - if (ret) { + if (ret && riscv_isa_fallback) { pr_info("Falling back to deprecated \"riscv,isa\"\n"); riscv_fill_hwcap_from_isa_string(isa2hwcap); } From 12d61a1bc28eef8d799ba00f370f421f7e942629 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Wed, 2 Aug 2023 00:21:58 +0000 Subject: [PATCH 13/51] RISC-V: cpu: refactor deprecated strncpy `strncpy` is deprecated for use on NUL-terminated destination strings [1]. Favor not copying strings onto stack and instead use strings directly. This avoids hard-coding sizes and buffer lengths altogether. Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Suggested-by: Kees Cook Signed-off-by: Justin Stitt Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Reviewed-by: Kees Cook Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230802-arch-riscv-kernel-v2-1-24266e85bc96@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c index 208f1a700121..7b793c4321bb 100644 --- a/arch/riscv/kernel/cpu.c +++ b/arch/riscv/kernel/cpu.c @@ -222,21 +222,21 @@ static void print_isa(struct seq_file *f) static void print_mmu(struct seq_file *f) { - char sv_type[16]; + const char *sv_type; #ifdef CONFIG_MMU #if defined(CONFIG_32BIT) - strncpy(sv_type, "sv32", 5); + sv_type = "sv32"; #elif defined(CONFIG_64BIT) if (pgtable_l5_enabled) - strncpy(sv_type, "sv57", 5); + sv_type = "sv57"; else if (pgtable_l4_enabled) - strncpy(sv_type, "sv48", 5); + sv_type = "sv48"; else - strncpy(sv_type, "sv39", 5); + sv_type = "sv39"; #endif #else - strncpy(sv_type, "none", 5); + sv_type = "none"; #endif /* CONFIG_MMU */ seq_printf(f, "mmu\t\t: %s\n", sv_type); } From 
174e8ac0272d54a9c1cc23185665f715c36620ad Mon Sep 17 00:00:00 2001 From: Yuan Tan Date: Mon, 24 Jul 2023 00:51:55 +0800 Subject: [PATCH 14/51] riscv: alternatives: fix a typo in comment In the usage of ALTERNATIVE, "always" is misspelled as "alwyas". Signed-off-by: Yuan Tan Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230723165155.4896-1-tanyuan@tinylab.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/alternative-macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/alternative-macros.h b/arch/riscv/include/asm/alternative-macros.h index b8c55fb3ab2c..721ec275ce57 100644 --- a/arch/riscv/include/asm/alternative-macros.h +++ b/arch/riscv/include/asm/alternative-macros.h @@ -146,7 +146,7 @@ * vendor_id: The CPU vendor ID. * patch_id: The patch ID (erratum ID or cpufeature ID). * CONFIG_k: The Kconfig of this patch ID. When Kconfig is disabled, the old - * content will alwyas be executed. + * content will always be executed. */ #define ALTERNATIVE(old_content, new_content, vendor_id, patch_id, CONFIG_k) \ _ALTERNATIVE_CFG(old_content, new_content, vendor_id, patch_id, CONFIG_k) From 366d259ff597e81d90639ae21269b3f82cd4ebb7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:19 +0200 Subject: [PATCH 15/51] perf: Fix wrong comment about default event_idx Since commit c719f56092ad ("perf: Fix and clean up initialization of pmu::event_idx"), event_idx default implementation has returned 0, not idx + 1, so fix the comment that can be misleading. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- include/linux/perf_event.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2166a69e3bf2..1269c96bc3b6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -445,7 +445,8 @@ struct pmu { /* * Will return the value for perf_event_mmap_page::index for this event, - * if no implementation is provided it will default to: event->hw.idx + 1. + * if no implementation is provided it will default to 0 (see + * perf_event_idx_default). */ int (*event_idx) (struct perf_event *event); /*optional */ From f117ae55b0198dd6fc36ce521e06d8b44a4bb203 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:20 +0200 Subject: [PATCH 16/51] include: riscv: Fix wrong include guard in riscv_pmu.h The current include guard prevents the inclusion of asm/perf_event.h which uses the same include guard: fix the one in riscv_pmu.h so that it matches the file name. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Conor Dooley Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- include/linux/perf/riscv_pmu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 43fc892aa7d9..9f70d94942e0 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -6,8 +6,8 @@ * */ -#ifndef _ASM_RISCV_PERF_EVENT_H -#define _ASM_RISCV_PERF_EVENT_H +#ifndef _RISCV_PMU_H +#define _RISCV_PMU_H #include #include @@ -81,4 +81,4 @@ int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr); #endif /* CONFIG_RISCV_PMU */ -#endif /* _ASM_RISCV_PERF_EVENT_H */ +#endif /* _RISCV_PMU_H */ From e8b785e98abb0cc12edd4ba2d3ccad767ab8b68d Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:21 +0200 Subject: [PATCH 17/51] riscv: Make legacy counter enum match the HW numbering RISCV_PMU_LEGACY_INSTRET used to be set to 1 whereas the offset of this hardware counter from CSR_CYCLE is actually 2: make this offset match the real hw offset so that we can directly expose those values to userspace. Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- drivers/perf/riscv_pmu_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/riscv_pmu_legacy.c b/drivers/perf/riscv_pmu_legacy.c index ca9e20bfc7ac..6a000abc28bb 100644 --- a/drivers/perf/riscv_pmu_legacy.c +++ b/drivers/perf/riscv_pmu_legacy.c @@ -13,7 +13,7 @@ #include #define RISCV_PMU_LEGACY_CYCLE 0 -#define RISCV_PMU_LEGACY_INSTRET 1 +#define RISCV_PMU_LEGACY_INSTRET 2 static bool pmu_init_done; From d5ac062d82d87124ac75e4273e3887578a7fae60 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:22 +0200 Subject: [PATCH 18/51] drivers: perf: Rename riscv pmu sbi driver That's just cosmetic, no functional changes. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- drivers/perf/riscv_pmu_sbi.c | 4 ++-- include/linux/perf/riscv_pmu.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 4163ff517471..760eb2afcf82 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -907,7 +907,7 @@ out_free: static struct platform_driver pmu_sbi_driver = { .probe = pmu_sbi_device_probe, .driver = { - .name = RISCV_PMU_PDEV_NAME, + .name = RISCV_PMU_SBI_PDEV_NAME, }, }; @@ -934,7 +934,7 @@ static int __init pmu_sbi_devinit(void) if (ret) return ret; - pdev = platform_device_register_simple(RISCV_PMU_PDEV_NAME, -1, NULL, 0); + pdev = platform_device_register_simple(RISCV_PMU_SBI_PDEV_NAME, -1, NULL, 0); if (IS_ERR(pdev)) { platform_driver_unregister(&pmu_sbi_driver); return PTR_ERR(pdev); diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 9f70d94942e0..5deeea0be7cb 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -21,7 +21,7 @@ #define RISCV_MAX_COUNTERS 64 #define RISCV_OP_UNSUPP (-EOPNOTSUPP) -#define RISCV_PMU_PDEV_NAME "riscv-pmu" +#define RISCV_PMU_SBI_PDEV_NAME "riscv-pmu-sbi" #define RISCV_PMU_LEGACY_PDEV_NAME "riscv-pmu-legacy" #define RISCV_PMU_STOP_FLAG_RESET 1 From 83c5e13b8cbbed9479cf568e03a5010d827e9781 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:23 +0200 Subject: [PATCH 19/51] riscv: Prepare for user-space perf event mmap support Provide all the necessary bits in the generic riscv pmu driver to be able to mmap perf events in userspace: the heavy lifting lies in the driver backend, namely the legacy and sbi implementations. Note that arch_perf_update_userpage is almost a copy of arm64 code. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- drivers/perf/riscv_pmu.c | 105 +++++++++++++++++++++++++++++++++ include/linux/perf/riscv_pmu.h | 4 ++ 2 files changed, 109 insertions(+) diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index ebca5eab9c9b..432ad2e80ce3 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -14,9 +14,73 @@ #include #include #include +#include #include +static bool riscv_perf_user_access(struct perf_event *event) +{ + return ((event->attr.type == PERF_TYPE_HARDWARE) || + (event->attr.type == PERF_TYPE_HW_CACHE) || + (event->attr.type == PERF_TYPE_RAW)) && + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); +} + +void arch_perf_update_userpage(struct perf_event *event, + struct perf_event_mmap_page *userpg, u64 now) +{ + struct clock_read_data *rd; + unsigned int seq; + u64 ns; + + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_time_short = 0; + userpg->cap_user_rdpmc = riscv_perf_user_access(event); + + userpg->pmc_width = 64; + + do { + rd = sched_clock_read_begin(&seq); + + userpg->time_mult = rd->mult; + userpg->time_shift = rd->shift; + userpg->time_zero = rd->epoch_ns; + userpg->time_cycles = rd->epoch_cyc; + userpg->time_mask = rd->sched_clock_mask; + + /* + * Subtract the cycle base, such that software that + * doesn't know about cap_user_time_short still 'works' + * assuming no wraps. + */ + ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift); + userpg->time_zero -= ns; + + } while (sched_clock_read_retry(seq)); + + userpg->time_offset = userpg->time_zero - now; + + /* + * time_shift is not expected to be greater than 31 due to + * the original published conversion algorithm shifting a + * 32-bit value (now specifies a 64-bit value) - refer + * perf_event_mmap_page documentation in perf_event.h. 
+ */ + if (userpg->time_shift == 32) { + userpg->time_shift = 31; + userpg->time_mult >>= 1; + } + + /* + * Internal timekeeping for enabled/running/stopped times + * is always computed with the sched_clock. + */ + userpg->cap_user_time = 1; + userpg->cap_user_time_zero = 1; + userpg->cap_user_time_short = 1; +} + static unsigned long csr_read_num(int csr_num) { #define switchcase_csr_read(__csr_num, __val) {\ @@ -171,6 +235,8 @@ int riscv_pmu_event_set_period(struct perf_event *event) local64_set(&hwc->prev_count, (u64)-left); + perf_event_update_userpage(event); + return overflow; } @@ -267,6 +333,9 @@ static int riscv_pmu_event_init(struct perf_event *event) hwc->idx = -1; hwc->event_base = mapped_event; + if (rvpmu->event_init) + rvpmu->event_init(event); + if (!is_sampling_event(event)) { /* * For non-sampling runs, limit the sample_period to half @@ -283,6 +352,39 @@ static int riscv_pmu_event_init(struct perf_event *event) return 0; } +static int riscv_pmu_event_idx(struct perf_event *event) +{ + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu); + + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) + return 0; + + if (rvpmu->csr_index) + return rvpmu->csr_index(event) + 1; + + return 0; +} + +static void riscv_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) +{ + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu); + + if (rvpmu->event_mapped) { + rvpmu->event_mapped(event, mm); + perf_event_update_userpage(event); + } +} + +static void riscv_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) +{ + struct riscv_pmu *rvpmu = to_riscv_pmu(event->pmu); + + if (rvpmu->event_unmapped) { + rvpmu->event_unmapped(event, mm); + perf_event_update_userpage(event); + } +} + struct riscv_pmu *riscv_pmu_alloc(void) { struct riscv_pmu *pmu; @@ -307,6 +409,9 @@ struct riscv_pmu *riscv_pmu_alloc(void) } pmu->pmu = (struct pmu) { .event_init = riscv_pmu_event_init, + .event_mapped = riscv_pmu_event_mapped, + .event_unmapped = 
riscv_pmu_event_unmapped, + .event_idx = riscv_pmu_event_idx, .add = riscv_pmu_add, .del = riscv_pmu_del, .start = riscv_pmu_start, diff --git a/include/linux/perf/riscv_pmu.h b/include/linux/perf/riscv_pmu.h index 5deeea0be7cb..43282e22ebe1 100644 --- a/include/linux/perf/riscv_pmu.h +++ b/include/linux/perf/riscv_pmu.h @@ -55,6 +55,10 @@ struct riscv_pmu { void (*ctr_start)(struct perf_event *event, u64 init_val); void (*ctr_stop)(struct perf_event *event, unsigned long flag); int (*event_map)(struct perf_event *event, u64 *config); + void (*event_init)(struct perf_event *event); + void (*event_mapped)(struct perf_event *event, struct mm_struct *mm); + void (*event_unmapped)(struct perf_event *event, struct mm_struct *mm); + uint8_t (*csr_index)(struct perf_event *event); struct cpu_hw_events __percpu *hw_events; struct hlist_node node; From 50be342829053d6d4a3c66eacc0e778f6611a37a Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:24 +0200 Subject: [PATCH 20/51] drivers: perf: Implement perf event mmap support in the legacy backend Implement the needed callbacks in the legacy driver so that we can directly access the counters through perf in userspace. 
Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra --- drivers/perf/riscv_pmu_legacy.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/perf/riscv_pmu_legacy.c b/drivers/perf/riscv_pmu_legacy.c index 6a000abc28bb..79fdd667922e 100644 --- a/drivers/perf/riscv_pmu_legacy.c +++ b/drivers/perf/riscv_pmu_legacy.c @@ -71,6 +71,29 @@ static void pmu_legacy_ctr_start(struct perf_event *event, u64 ival) local64_set(&hwc->prev_count, initial_val); } +static uint8_t pmu_legacy_csr_index(struct perf_event *event) +{ + return event->hw.idx; +} + +static void pmu_legacy_event_mapped(struct perf_event *event, struct mm_struct *mm) +{ + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES && + event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) + return; + + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; +} + +static void pmu_legacy_event_unmapped(struct perf_event *event, struct mm_struct *mm) +{ + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES && + event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) + return; + + event->hw.flags &= ~PERF_EVENT_FLAG_USER_READ_CNT; +} + /* * This is just a simple implementation to allow legacy implementations * compatible with new RISC-V PMU driver framework. @@ -91,6 +114,9 @@ static void pmu_legacy_init(struct riscv_pmu *pmu) pmu->ctr_get_width = NULL; pmu->ctr_clear_idx = NULL; pmu->ctr_read = pmu_legacy_read_ctr; + pmu->event_mapped = pmu_legacy_event_mapped; + pmu->event_unmapped = pmu_legacy_event_unmapped; + pmu->csr_index = pmu_legacy_csr_index; perf_pmu_register(&pmu->pmu, "cpu", PERF_TYPE_RAW); } From cc4c07c89aada16229084eeb93895c95b7eabaa3 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:25 +0200 Subject: [PATCH 21/51] drivers: perf: Implement perf event mmap support in the SBI backend We used to unconditionally expose the cycle and instret csrs to userspace, which gives rise to security concerns. 
So now we only allow access to hw counters from userspace through the perf framework which will handle context switches, per-task events, etc. A sysctl allows reverting the behaviour to the legacy mode so that userspace applications which are not ready for this change do not break. But the default value is to allow userspace only through perf: this will break userspace applications which rely on direct access to rdcycle. This choice was made for security reasons [1][2]: most of the applications which use rdcycle can instead use rdtime to count the elapsed time. [1] https://groups.google.com/a/groups.riscv.org/g/sw-dev/c/REWcwYnzsKE?pli=1 [2] https://www.youtube.com/watch?v=3-c4C_L2PRQ&ab_channel=IEEESymposiumonSecurityandPrivacy Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones --- drivers/perf/riscv_pmu.c | 10 +- drivers/perf/riscv_pmu_sbi.c | 192 +++++++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 7 deletions(-) diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index 432ad2e80ce3..80c052e93f9e 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -38,7 +38,15 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time_short = 0; userpg->cap_user_rdpmc = riscv_perf_user_access(event); - userpg->pmc_width = 64; +#ifdef CONFIG_RISCV_PMU + /* + * The counters are 64-bit but the priv spec doesn't mandate all the + * bits to be implemented: that's why, counter width can vary based on + * the cpu vendor. 
+ */ + if (userpg->cap_user_rdpmc) + userpg->pmc_width = to_riscv_pmu(event->pmu)->ctr_get_width(event->hw.idx) + 1; +#endif do { rd = sched_clock_read_begin(&seq); diff --git a/drivers/perf/riscv_pmu_sbi.c b/drivers/perf/riscv_pmu_sbi.c index 760eb2afcf82..9a51053b1f99 100644 --- a/drivers/perf/riscv_pmu_sbi.c +++ b/drivers/perf/riscv_pmu_sbi.c @@ -24,6 +24,14 @@ #include #include +#define SYSCTL_NO_USER_ACCESS 0 +#define SYSCTL_USER_ACCESS 1 +#define SYSCTL_LEGACY 2 + +#define PERF_EVENT_FLAG_NO_USER_ACCESS BIT(SYSCTL_NO_USER_ACCESS) +#define PERF_EVENT_FLAG_USER_ACCESS BIT(SYSCTL_USER_ACCESS) +#define PERF_EVENT_FLAG_LEGACY BIT(SYSCTL_LEGACY) + PMU_FORMAT_ATTR(event, "config:0-47"); PMU_FORMAT_ATTR(firmware, "config:63"); @@ -43,6 +51,9 @@ static const struct attribute_group *riscv_pmu_attr_groups[] = { NULL, }; +/* Allow user mode access by default */ +static int sysctl_perf_user_access __read_mostly = SYSCTL_USER_ACCESS; + /* * RISC-V doesn't have heterogeneous harts yet. This need to be part of * per_cpu in case of harts with different pmu counters @@ -301,6 +312,11 @@ int riscv_pmu_get_hpm_info(u32 *hw_ctr_width, u32 *num_hw_ctr) } EXPORT_SYMBOL_GPL(riscv_pmu_get_hpm_info); +static uint8_t pmu_sbi_csr_index(struct perf_event *event) +{ + return pmu_ctr_list[event->hw.idx].csr - CSR_CYCLE; +} + static unsigned long pmu_sbi_get_filter_flags(struct perf_event *event) { unsigned long cflags = 0; @@ -329,18 +345,34 @@ static int pmu_sbi_ctr_get_idx(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(rvpmu->hw_events); struct sbiret ret; int idx; - uint64_t cbase = 0; + uint64_t cbase = 0, cmask = rvpmu->cmask; unsigned long cflags = 0; cflags = pmu_sbi_get_filter_flags(event); + + /* + * In legacy mode, we have to force the fixed counters for those events + * but not in the user access mode as we want to use the other counters + * that support sampling/filtering. 
+ */ + if (hwc->flags & PERF_EVENT_FLAG_LEGACY) { + if (event->attr.config == PERF_COUNT_HW_CPU_CYCLES) { + cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH; + cmask = 1; + } else if (event->attr.config == PERF_COUNT_HW_INSTRUCTIONS) { + cflags |= SBI_PMU_CFG_FLAG_SKIP_MATCH; + cmask = 1UL << (CSR_INSTRET - CSR_CYCLE); + } + } + /* retrieve the available counter index */ #if defined(CONFIG_32BIT) ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, - rvpmu->cmask, cflags, hwc->event_base, hwc->config, + cmask, cflags, hwc->event_base, hwc->config, hwc->config >> 32); #else ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_CFG_MATCH, cbase, - rvpmu->cmask, cflags, hwc->event_base, hwc->config, 0); + cmask, cflags, hwc->event_base, hwc->config, 0); #endif if (ret.error) { pr_debug("Not able to find a counter for event %lx config %llx\n", @@ -474,6 +506,22 @@ static u64 pmu_sbi_ctr_read(struct perf_event *event) return val; } +static void pmu_sbi_set_scounteren(void *arg) +{ + struct perf_event *event = (struct perf_event *)arg; + + csr_write(CSR_SCOUNTEREN, + csr_read(CSR_SCOUNTEREN) | (1 << pmu_sbi_csr_index(event))); +} + +static void pmu_sbi_reset_scounteren(void *arg) +{ + struct perf_event *event = (struct perf_event *)arg; + + csr_write(CSR_SCOUNTEREN, + csr_read(CSR_SCOUNTEREN) & ~(1 << pmu_sbi_csr_index(event))); +} + static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival) { struct sbiret ret; @@ -490,6 +538,10 @@ static void pmu_sbi_ctr_start(struct perf_event *event, u64 ival) if (ret.error && (ret.error != SBI_ERR_ALREADY_STARTED)) pr_err("Starting counter idx %d failed with error %d\n", hwc->idx, sbi_err_map_linux_errno(ret.error)); + + if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) && + (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) + pmu_sbi_set_scounteren((void *)event); } static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag) @@ -497,6 +549,10 @@ static void pmu_sbi_ctr_stop(struct perf_event *event, unsigned long flag) 
struct sbiret ret; struct hw_perf_event *hwc = &event->hw; + if ((hwc->flags & PERF_EVENT_FLAG_USER_ACCESS) && + (hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) + pmu_sbi_reset_scounteren((void *)event); + ret = sbi_ecall(SBI_EXT_PMU, SBI_EXT_PMU_COUNTER_STOP, hwc->idx, 1, flag, 0, 0, 0); if (ret.error && (ret.error != SBI_ERR_ALREADY_STOPPED) && flag != SBI_PMU_STOP_FLAG_RESET) @@ -704,10 +760,13 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node) struct cpu_hw_events *cpu_hw_evt = this_cpu_ptr(pmu->hw_events); /* - * Enable the access for CYCLE, TIME, and INSTRET CSRs from userspace, - * as is necessary to maintain uABI compatibility. + * We keep enabling userspace access to CYCLE, TIME and INSTRET via the + * legacy option but that will be removed in the future. */ - csr_write(CSR_SCOUNTEREN, 0x7); + if (sysctl_perf_user_access == SYSCTL_LEGACY) + csr_write(CSR_SCOUNTEREN, 0x7); + else + csr_write(CSR_SCOUNTEREN, 0x2); /* Stop all the counters so that they can be enabled from perf */ pmu_sbi_stop_all(pmu); @@ -838,6 +897,121 @@ static void riscv_pmu_destroy(struct riscv_pmu *pmu) cpuhp_state_remove_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node); } +static void pmu_sbi_event_init(struct perf_event *event) +{ + /* + * The permissions are set at event_init so that we do not depend + * on the sysctl value that can change. 
+ */ + if (sysctl_perf_user_access == SYSCTL_NO_USER_ACCESS) + event->hw.flags |= PERF_EVENT_FLAG_NO_USER_ACCESS; + else if (sysctl_perf_user_access == SYSCTL_USER_ACCESS) + event->hw.flags |= PERF_EVENT_FLAG_USER_ACCESS; + else + event->hw.flags |= PERF_EVENT_FLAG_LEGACY; +} + +static void pmu_sbi_event_mapped(struct perf_event *event, struct mm_struct *mm) +{ + if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS) + return; + + if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) { + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES && + event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) { + return; + } + } + + /* + * The user mmapped the event to directly access it: this is where + * we determine based on sysctl_perf_user_access if we grant userspace + * the direct access to this event. That means that within the same + * task, some events may be directly accessible and some other may not, + * if the user changes the value of sysctl_perf_user_access in the + * meantime. + */ + + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; + + /* + * We must enable userspace access *before* advertising in the user page + * that it is possible to do so to avoid any race. + * And we must notify all cpus here because threads that currently run + * on other cpus will try to directly access the counter too without + * calling pmu_sbi_ctr_start.
+ */ + if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS) + on_each_cpu_mask(mm_cpumask(mm), + pmu_sbi_set_scounteren, (void *)event, 1); +} + +static void pmu_sbi_event_unmapped(struct perf_event *event, struct mm_struct *mm) +{ + if (event->hw.flags & PERF_EVENT_FLAG_NO_USER_ACCESS) + return; + + if (event->hw.flags & PERF_EVENT_FLAG_LEGACY) { + if (event->attr.config != PERF_COUNT_HW_CPU_CYCLES && + event->attr.config != PERF_COUNT_HW_INSTRUCTIONS) { + return; + } + } + + /* + * Here we can directly remove user access since the user does not have + * access to the user page anymore so we avoid the racy window where the + * user could have read cap_user_rdpmc to true right before we disable + * it. + */ + event->hw.flags &= ~PERF_EVENT_FLAG_USER_READ_CNT; + + if (event->hw.flags & PERF_EVENT_FLAG_USER_ACCESS) + on_each_cpu_mask(mm_cpumask(mm), + pmu_sbi_reset_scounteren, (void *)event, 1); +} + +static void riscv_pmu_update_counter_access(void *info) +{ + if (sysctl_perf_user_access == SYSCTL_LEGACY) + csr_write(CSR_SCOUNTEREN, 0x7); + else + csr_write(CSR_SCOUNTEREN, 0x2); +} + +static int riscv_pmu_proc_user_access_handler(struct ctl_table *table, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + int prev = sysctl_perf_user_access; + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + /* + * Test against the previous value since we clear SCOUNTEREN when + * sysctl_perf_user_access is set to SYSCTL_USER_ACCESS, but we should + * not do that if that was already the case. 
+ */ + if (ret || !write || prev == sysctl_perf_user_access) + return ret; + + on_each_cpu(riscv_pmu_update_counter_access, NULL, 1); + + return 0; +} + +static struct ctl_table sbi_pmu_sysctl_table[] = { + { + .procname = "perf_user_access", + .data = &sysctl_perf_user_access, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = riscv_pmu_proc_user_access_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_TWO, + }, + { } +}; + static int pmu_sbi_device_probe(struct platform_device *pdev) { struct riscv_pmu *pmu = NULL; @@ -881,6 +1055,10 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) pmu->ctr_get_width = pmu_sbi_ctr_get_width; pmu->ctr_clear_idx = pmu_sbi_ctr_clear_idx; pmu->ctr_read = pmu_sbi_ctr_read; + pmu->event_init = pmu_sbi_event_init; + pmu->event_mapped = pmu_sbi_event_mapped; + pmu->event_unmapped = pmu_sbi_event_unmapped; + pmu->csr_index = pmu_sbi_csr_index; ret = cpuhp_state_add_instance(CPUHP_AP_PERF_RISCV_STARTING, &pmu->node); if (ret) @@ -894,6 +1072,8 @@ static int pmu_sbi_device_probe(struct platform_device *pdev) if (ret) goto out_unregister; + register_sysctl("kernel", sbi_pmu_sysctl_table); + return 0; out_unregister: From 57972127b20ef9cb84fc214f7cfacaa7ab884a38 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:26 +0200 Subject: [PATCH 22/51] Documentation: admin-guide: Add riscv sysctl_perf_user_access riscv now uses this sysctl so document its usage for this architecture. Signed-off-by: Alexandre Ghiti --- Documentation/admin-guide/sysctl/kernel.rst | 27 ++++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3800fab1619b..8019103aac10 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -941,16 +941,35 @@ enabled, otherwise writing to this file will return ``-EBUSY``. The default value is 8. 
-perf_user_access (arm64 only) -================================= +perf_user_access (arm64 and riscv only) +======================================= -Controls user space access for reading perf event counters. When set to 1, -user space can read performance monitor counter registers directly. +Controls user space access for reading perf event counters. + +arm64 +===== The default value is 0 (access disabled). +When set to 1, user space can read performance monitor counter registers +directly. + See Documentation/arch/arm64/perf.rst for more information. +riscv +===== + +When set to 0, user space access is disabled. + +The default value is 1: user space can read performance monitor counter +registers through perf; any direct access without perf intervention will trigger +an illegal instruction. + +When set to 2, legacy mode is enabled (user space has direct access to cycle +and instret CSRs only). Note that this legacy value is deprecated and will be +removed once all user space applications are fixed. + +Note that the time CSR is always directly accessible to all modes. pid_max ======= From 60bd50116484b84fcfcd0db55d6d821ff3a21541 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:27 +0200 Subject: [PATCH 23/51] tools: lib: perf: Implement riscv mmap support riscv now supports mmaping hardware counters so add what's needed to take advantage of that in libperf.
Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Reviewed-by: Ian Rogers --- tools/lib/perf/mmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index 0d1634cedf44..2184814b37dd 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -392,6 +392,72 @@ static u64 read_perf_counter(unsigned int counter) static u64 read_timestamp(void) { return read_sysreg(cntvct_el0); } +/* __riscv_xlen contains the width of the native base integer, here 64-bit */ +#elif defined(__riscv) && __riscv_xlen == 64 + +/* TODO: implement rv32 support */ + +#define CSR_CYCLE 0xc00 +#define CSR_TIME 0xc01 + +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__ ("csrr %0, %1" \ + : "=r" (__v) \ + : "i" (csr) : ); \ + __v; \ +}) + +static unsigned long csr_read_num(int csr_num) +{ +#define switchcase_csr_read(__csr_num, __val) {\ + case __csr_num: \ + __val = csr_read(__csr_num); \ + break; } +#define switchcase_csr_read_2(__csr_num, __val) {\ + switchcase_csr_read(__csr_num + 0, __val) \ + switchcase_csr_read(__csr_num + 1, __val)} +#define switchcase_csr_read_4(__csr_num, __val) {\ + switchcase_csr_read_2(__csr_num + 0, __val) \ + switchcase_csr_read_2(__csr_num + 2, __val)} +#define switchcase_csr_read_8(__csr_num, __val) {\ + switchcase_csr_read_4(__csr_num + 0, __val) \ + switchcase_csr_read_4(__csr_num + 4, __val)} +#define switchcase_csr_read_16(__csr_num, __val) {\ + switchcase_csr_read_8(__csr_num + 0, __val) \ + switchcase_csr_read_8(__csr_num + 8, __val)} +#define switchcase_csr_read_32(__csr_num, __val) {\ + switchcase_csr_read_16(__csr_num + 0, __val) \ + switchcase_csr_read_16(__csr_num + 16, __val)} + + unsigned long ret = 0; + + switch (csr_num) { + switchcase_csr_read_32(CSR_CYCLE, ret) + default: + break; + } + + return ret; +#undef switchcase_csr_read_32 +#undef switchcase_csr_read_16 +#undef
switchcase_csr_read_8 +#undef switchcase_csr_read_4 +#undef switchcase_csr_read_2 +#undef switchcase_csr_read +} + +static u64 read_perf_counter(unsigned int counter) +{ + return csr_read_num(CSR_CYCLE + counter); +} + +static u64 read_timestamp(void) +{ + return csr_read_num(CSR_TIME); +} + #else static u64 read_perf_counter(unsigned int counter __maybe_unused) { return 0; } static u64 read_timestamp(void) { return 0; } From 26ba042414a35cb1fd7c31fae63841956ce7cecb Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Wed, 2 Aug 2023 10:03:28 +0200 Subject: [PATCH 24/51] perf: tests: Adapt mmap-basic.c for riscv riscv now supports mmaping hardware counters to userspace so adapt the test to run on this architecture. Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Reviewed-by: Ian Rogers --- tools/perf/tests/mmap-basic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/mmap-basic.c b/tools/perf/tests/mmap-basic.c index e68ca6229756..886a13a77a16 100644 --- a/tools/perf/tests/mmap-basic.c +++ b/tools/perf/tests/mmap-basic.c @@ -284,7 +284,8 @@ static struct test_case tests__basic_mmap[] = { "permissions"), TEST_CASE_REASON("User space counter reading of instructions", mmap_user_read_instr, -#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) || \ + (defined(__riscv) && __riscv_xlen == 64) "permissions" #else "unsupported" @@ -292,7 +293,8 @@ static struct test_case tests__basic_mmap[] = { ), TEST_CASE_REASON("User space counter reading of cycles", mmap_user_read_cycles, -#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) +#if defined(__i386__) || defined(__x86_64__) || defined(__aarch64__) || \ + (defined(__riscv) && __riscv_xlen == 64) "permissions" #else "unsupported" From a93892974f2e46bed59b366133f3ef60d70d3b66 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 30 Jul 2023 10:27:07 +0200 
Subject: [PATCH 25/51] riscv: kprobes: simulate c.j instruction kprobes currently rejects c.j instruction. Implement it. Signed-off-by: Nam Cao Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/6ef76cd9984b8015826649d13f870f8ac45a2d0d.1690704360.git.namcaov@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/decode-insn.c | 3 ++- arch/riscv/kernel/probes/simulate-insn.c | 24 ++++++++++++++++++++++++ arch/riscv/kernel/probes/simulate-insn.h | 1 + 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 64f6183b4717..39adb07a342d 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -29,13 +29,14 @@ riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) * TODO: the REJECTED ones below need to be implemented */ #ifdef CONFIG_RISCV_ISA_C - RISCV_INSN_REJECTED(c_j, insn); RISCV_INSN_REJECTED(c_jr, insn); RISCV_INSN_REJECTED(c_jal, insn); RISCV_INSN_REJECTED(c_jalr, insn); RISCV_INSN_REJECTED(c_beqz, insn); RISCV_INSN_REJECTED(c_bnez, insn); RISCV_INSN_REJECTED(c_ebreak, insn); + + RISCV_INSN_SET_SIMULATE(c_j, insn); #endif RISCV_INSN_SET_SIMULATE(jal, insn); diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 7441ac8a6843..3ba45c612cd8 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -188,3 +188,27 @@ bool __kprobes simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *r return true; } + +bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + /* + * 15 13 12 2 1 0 + * | funct3 | offset[11|4|9:8|10|6|7|3:1|5] | opcode | + * 3 11 2 + */ + + s32 offset; + + offset = ((opcode >> 3) & 0x7) << 1; + offset |= ((opcode >> 11) & 0x1) << 4; + offset |= ((opcode >> 2) & 0x1) << 5; + offset |= ((opcode >> 7) & 0x1) << 6; + offset |= ((opcode >> 6) & 0x1) 
<< 7; + offset |= ((opcode >> 9) & 0x3) << 8; + offset |= ((opcode >> 8) & 0x1) << 10; + offset |= ((opcode >> 12) & 0x1) << 11; + + instruction_pointer_set(regs, addr + sign_extend32(offset, 11)); + + return true; +} diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index 61e35db31001..4bd6c266e7d3 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -24,5 +24,6 @@ bool simulate_auipc(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs); #endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ From b18256d9b744497410bc124a94a546c5eef579eb Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 30 Jul 2023 10:27:08 +0200 Subject: [PATCH 26/51] riscv: kprobes: simulate c.jr and c.jalr instructions kprobes currently rejects c.jr and c.jalr instructions. Implement them. 
Signed-off-by: Nam Cao Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/db8b7787e9208654cca50484f68334f412be2ea9.1690704360.git.namcaov@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/decode-insn.c | 4 +-- arch/riscv/kernel/probes/simulate-insn.c | 37 ++++++++++++++++++++++++ arch/riscv/kernel/probes/simulate-insn.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 39adb07a342d..6dba23a55ac7 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -29,14 +29,14 @@ riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) * TODO: the REJECTED ones below need to be implemented */ #ifdef CONFIG_RISCV_ISA_C - RISCV_INSN_REJECTED(c_jr, insn); RISCV_INSN_REJECTED(c_jal, insn); - RISCV_INSN_REJECTED(c_jalr, insn); RISCV_INSN_REJECTED(c_beqz, insn); RISCV_INSN_REJECTED(c_bnez, insn); RISCV_INSN_REJECTED(c_ebreak, insn); RISCV_INSN_SET_SIMULATE(c_j, insn); + RISCV_INSN_SET_SIMULATE(c_jr, insn); + RISCV_INSN_SET_SIMULATE(c_jalr, insn); #endif RISCV_INSN_SET_SIMULATE(jal, insn); diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 3ba45c612cd8..1ead6f4951f9 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -212,3 +212,40 @@ bool __kprobes simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs return true; } + +static bool __kprobes simulate_c_jr_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs, + bool is_jalr) +{ + /* + * 15 12 11 7 6 2 1 0 + * | funct4 | rs1 | rs2 | op | + * 4 5 5 2 + */ + + unsigned long jump_addr; + + u32 rs1 = (opcode >> 7) & 0x1f; + + if (rs1 == 0) /* C.JR is only valid when rs1 != x0 */ + return false; + + if (!rv_insn_reg_get_val(regs, rs1, &jump_addr)) + return false; + + if (is_jalr && !rv_insn_reg_set_val(regs, 1, addr + 2)) + 
return false; + + instruction_pointer_set(regs, jump_addr); + + return true; +} + +bool __kprobes simulate_c_jr(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_jr_jalr(opcode, addr, regs, false); +} + +bool __kprobes simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_jr_jalr(opcode, addr, regs, true); +} diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index 4bd6c266e7d3..472a1948ec4f 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -25,5 +25,7 @@ bool simulate_branch(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_jal(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_jr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); #endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ From d943705fba3af1dec5a999cb3739949710a1aa90 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Sun, 30 Jul 2023 10:27:09 +0200 Subject: [PATCH 27/51] riscv: kprobes: simulate c.beqz and c.bnez kprobes currently rejects instruction c.beqz and c.bnez. Implement them. 
Signed-off-by: Nam Cao Reviewed-by: Charlie Jenkins Link: https://lore.kernel.org/r/1d879dba4e4ee9a82e27625d6483b5c9cfed684f.1690704360.git.namcaov@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/decode-insn.c | 4 +-- arch/riscv/kernel/probes/simulate-insn.c | 44 ++++++++++++++++++++++++ arch/riscv/kernel/probes/simulate-insn.h | 2 ++ 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kernel/probes/decode-insn.c b/arch/riscv/kernel/probes/decode-insn.c index 6dba23a55ac7..65d9590bfb9f 100644 --- a/arch/riscv/kernel/probes/decode-insn.c +++ b/arch/riscv/kernel/probes/decode-insn.c @@ -30,13 +30,13 @@ riscv_probe_decode_insn(probe_opcode_t *addr, struct arch_probe_insn *api) */ #ifdef CONFIG_RISCV_ISA_C RISCV_INSN_REJECTED(c_jal, insn); - RISCV_INSN_REJECTED(c_beqz, insn); - RISCV_INSN_REJECTED(c_bnez, insn); RISCV_INSN_REJECTED(c_ebreak, insn); RISCV_INSN_SET_SIMULATE(c_j, insn); RISCV_INSN_SET_SIMULATE(c_jr, insn); RISCV_INSN_SET_SIMULATE(c_jalr, insn); + RISCV_INSN_SET_SIMULATE(c_beqz, insn); + RISCV_INSN_SET_SIMULATE(c_bnez, insn); #endif RISCV_INSN_SET_SIMULATE(jal, insn); diff --git a/arch/riscv/kernel/probes/simulate-insn.c b/arch/riscv/kernel/probes/simulate-insn.c index 1ead6f4951f9..d3099d67816d 100644 --- a/arch/riscv/kernel/probes/simulate-insn.c +++ b/arch/riscv/kernel/probes/simulate-insn.c @@ -249,3 +249,47 @@ bool __kprobes simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *r { return simulate_c_jr_jalr(opcode, addr, regs, true); } + +static bool __kprobes simulate_c_bnez_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs, + bool is_bnez) +{ + /* + * 15 13 12 10 9 7 6 2 1 0 + * | funct3 | offset[8|4:3] | rs1' | offset[7:6|2:1|5] | op | + * 3 3 3 5 2 + */ + + s32 offset; + u32 rs1; + unsigned long rs1_val; + + rs1 = 0x8 | ((opcode >> 7) & 0x7); + + if (!rv_insn_reg_get_val(regs, rs1, &rs1_val)) + return false; + + if ((rs1_val != 0 && is_bnez) || (rs1_val == 0 && !is_bnez)) { + offset = 
((opcode >> 3) & 0x3) << 1; + offset |= ((opcode >> 10) & 0x3) << 3; + offset |= ((opcode >> 2) & 0x1) << 5; + offset |= ((opcode >> 5) & 0x3) << 6; + offset |= ((opcode >> 12) & 0x1) << 8; + offset = sign_extend32(offset, 8); + } else { + offset = 2; + } + + instruction_pointer_set(regs, addr + offset); + + return true; +} + +bool __kprobes simulate_c_bnez(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_bnez_beqz(opcode, addr, regs, true); +} + +bool __kprobes simulate_c_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs) +{ + return simulate_c_bnez_beqz(opcode, addr, regs, false); +} diff --git a/arch/riscv/kernel/probes/simulate-insn.h b/arch/riscv/kernel/probes/simulate-insn.h index 472a1948ec4f..44ebbc444db9 100644 --- a/arch/riscv/kernel/probes/simulate-insn.h +++ b/arch/riscv/kernel/probes/simulate-insn.h @@ -27,5 +27,7 @@ bool simulate_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_c_j(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_c_jr(u32 opcode, unsigned long addr, struct pt_regs *regs); bool simulate_c_jalr(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_bnez(u32 opcode, unsigned long addr, struct pt_regs *regs); +bool simulate_c_beqz(u32 opcode, unsigned long addr, struct pt_regs *regs); #endif /* _RISCV_KERNEL_PROBES_SIMULATE_INSN_H */ From 5882e5acf18d79d586282acfd07a8c88550e2cee Mon Sep 17 00:00:00 2001 From: Chen Jiahao Date: Wed, 26 Jul 2023 17:49:59 +0000 Subject: [PATCH 28/51] riscv: kdump: Implement crashkernel=X,[high,low] On riscv, the current crash kernel allocation logic is trying to allocate within 32bit addressible memory region by default, if failed, try to allocate without 4G restriction. In need of saving DMA zone memory while allocating a relatively large crash kernel region, allocating the reserved memory top down in high memory, without overlapping the DMA zone, is a mature solution. 
Here introduce the parameter option crashkernel=X,[high,low]. One can reserve the crash kernel from high memory above DMA zone range by explicitly passing "crashkernel=X,high"; or reserve a memory range below 4G with "crashkernel=X,low". Signed-off-by: Chen Jiahao Acked-by: Guo Ren Acked-by: Baoquan He Link: https://lore.kernel.org/r/20230726175000.2536220-2-chenjiahao16@huawei.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 5 +++ arch/riscv/mm/init.c | 93 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 971fe776e2f8..376f5d49ce85 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -178,6 +178,11 @@ static void __init init_resources(void) if (ret < 0) goto error; } + if (crashk_low_res.start != crashk_low_res.end) { + ret = add_resource(&iomem_resource, &crashk_low_res); + if (ret < 0) + goto error; + } #endif #ifdef CONFIG_CRASH_DUMP diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 70fb31960b63..156e09a1ceee 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1297,6 +1297,28 @@ static inline void setup_vm_final(void) } #endif /* CONFIG_MMU */ +/* Reserve 128M low memory by default for swiotlb buffer */ +#define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) + +static int __init reserve_crashkernel_low(unsigned long long low_size) +{ + unsigned long long low_base; + + low_base = memblock_phys_alloc_range(low_size, PMD_SIZE, 0, dma32_phys_limit); + if (!low_base) { + pr_err("cannot allocate crashkernel low memory (size:0x%llx).\n", low_size); + return -ENOMEM; + } + + pr_info("crashkernel low memory reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + low_base, low_base + low_size, low_size >> 20); + + crashk_low_res.start = low_base; + crashk_low_res.end = low_base + low_size - 1; + + return 0; +} + /* * reserve_crashkernel() - reserves memory for crash kernel * @@ -1308,8 +1330,12 @@ static 
void __init reserve_crashkernel(void) { unsigned long long crash_base = 0; unsigned long long crash_size = 0; + unsigned long long crash_low_size = 0; unsigned long search_start = memblock_start_of_DRAM(); - unsigned long search_end = memblock_end_of_DRAM(); + unsigned long search_end = (unsigned long)dma32_phys_limit; + char *cmdline = boot_command_line; + bool fixed_base = false; + bool high = false; int ret = 0; @@ -1325,14 +1351,36 @@ static void __init reserve_crashkernel(void) return; } - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + ret = parse_crashkernel(cmdline, memblock_phys_mem_size(), &crash_size, &crash_base); - if (ret || !crash_size) + if (ret == -ENOENT) { + /* Fallback to crashkernel=X,[high,low] */ + ret = parse_crashkernel_high(cmdline, 0, &crash_size, &crash_base); + if (ret || !crash_size) + return; + + /* + * crashkernel=Y,low is valid only when crashkernel=X,high + * is passed. + */ + ret = parse_crashkernel_low(cmdline, 0, &crash_low_size, &crash_base); + if (ret == -ENOENT) + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + else if (ret) + return; + + search_start = (unsigned long)dma32_phys_limit; + search_end = memblock_end_of_DRAM(); + high = true; + } else if (ret || !crash_size) { + /* Invalid argument value specified */ return; + } crash_size = PAGE_ALIGN(crash_size); if (crash_base) { + fixed_base = true; search_start = crash_base; search_end = crash_base + crash_size; } @@ -1345,12 +1393,37 @@ static void __init reserve_crashkernel(void) * swiotlb can work on the crash kernel. */ crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, - min(search_end, (unsigned long) SZ_4G)); + search_start, search_end); if (crash_base == 0) { - /* Try again without restricting region to 32bit addressible memory */ + /* + * For crashkernel=size[KMG]@offset[KMG], print out failure + * message if can't reserve the specified region. 
+ */ + if (fixed_base) { + pr_warn("crashkernel: allocating failed with given size@offset\n"); + return; + } + + if (high) { + /* + * For crashkernel=size[KMG],high, if the first attempt was + * for high memory, fall back to low memory. + */ + search_start = memblock_start_of_DRAM(); + search_end = (unsigned long)dma32_phys_limit; + } else { + /* + * For crashkernel=size[KMG], if the first attempt was for + * low memory, fall back to high memory, the minimum required + * low memory will be reserved later. + */ + search_start = (unsigned long)dma32_phys_limit; + search_end = memblock_end_of_DRAM(); + crash_low_size = DEFAULT_CRASH_KERNEL_LOW_SIZE; + } + crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, - search_start, search_end); + search_start, search_end); if (crash_base == 0) { pr_warn("crashkernel: couldn't allocate %lldKB\n", crash_size >> 10); @@ -1358,6 +1431,12 @@ static void __init reserve_crashkernel(void) } } + if ((crash_base >= dma32_phys_limit) && crash_low_size && + reserve_crashkernel_low(crash_low_size)) { + memblock_phys_free(crash_base, crash_size); + return; + } + pr_info("crashkernel: reserved 0x%016llx - 0x%016llx (%lld MB)\n", crash_base, crash_base + crash_size, crash_size >> 20); From 33f0dd973d4e7d3ed208b5df027490380d5876ab Mon Sep 17 00:00:00 2001 From: Chen Jiahao Date: Wed, 26 Jul 2023 17:50:00 +0000 Subject: [PATCH 29/51] docs: kdump: Update the crashkernel description for riscv Now "crashkernel=" parameter on riscv has been updated to support crashkernel=X,[high,low]. Through which we can reserve memory region above/within 32bit addressible DMA zone. Here update the parameter description accordingly. 
Signed-off-by: Chen Jiahao Reviewed-by: Guo Ren Reviewed-by: Simon Horman Reviewed-by: Zhen Lei Acked-by: Baoquan He Link: https://lore.kernel.org/r/20230726175000.2536220-3-chenjiahao16@huawei.com Signed-off-by: Palmer Dabbelt --- Documentation/admin-guide/kernel-parameters.txt | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a1457995fd41..bd519749968f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -862,7 +862,7 @@ memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset is selected automatically. - [KNL, X86-64, ARM64] Select a region under 4G first, and + [KNL, X86-64, ARM64, RISCV] Select a region under 4G first, and fall back to reserve region above 4G when '@offset' hasn't been specified. See Documentation/admin-guide/kdump/kdump.rst for further details. @@ -875,14 +875,14 @@ Documentation/admin-guide/kdump/kdump.rst for an example. crashkernel=size[KMG],high - [KNL, X86-64, ARM64] range could be above 4G. Allow kernel - to allocate physical memory region from top, so could - be above 4G if system have more than 4G ram installed. - Otherwise memory region will be allocated below 4G, if - available. + [KNL, X86-64, ARM64, RISCV] range could be above 4G. + Allow kernel to allocate physical memory region from top, + so could be above 4G if system have more than 4G ram + installed. Otherwise memory region will be allocated + below 4G, if available. It will be ignored if crashkernel=X is specified. crashkernel=size[KMG],low - [KNL, X86-64, ARM64] range under 4G. When crashkernel=X,high + [KNL, X86-64, ARM64, RISCV] range under 4G. When crashkernel=X,high is passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. 
swiotlb @@ -893,6 +893,7 @@ size is platform dependent. --> x86: max(swiotlb_size_or_default() + 8MiB, 256MiB) --> arm64: 128MiB + --> riscv: 128MiB This one lets the user specify own low range under 4G for second kernel instead. 0: to disable low allocation. From 9f944d2e0ab39296bfadb29167dc333815ba9f48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Wed, 23 Aug 2023 10:28:45 +0200 Subject: [PATCH 30/51] riscv: Require FRAME_POINTER for some configurations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some V configurations implicitly turn on '-fno-omit-frame-pointer', but leaving FRAME_POINTER disabled. This makes it hard to reason about the FRAME_POINTER config, and also triggers build failures introduced in by the commit in the Fixes: tag. Select FRAME_POINTER explicitly for these configurations. Fixes: ebc9cb03b21e ("riscv: stack: Fixup independent softirq stack for CONFIG_FRAME_POINTER=n") Signed-off-by: Björn Töpel Tested-by: Randy Dunlap Acked-by: Randy Dunlap Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230823082845.354839-1-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/Makefile | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index f52dd125ac5e..afa7160b136c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -62,6 +62,7 @@ config RISCV select COMMON_CLK select CPU_PM if CPU_IDLE || HIBERNATION select EDAC_SUPPORT + select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) select GENERIC_ARCH_TOPOLOGY select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 6ec6d52a4180..1329e060c548 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -87,9 +87,6 @@ endif ifeq ($(CONFIG_CMODEL_MEDANY),y) KBUILD_CFLAGS += -mcmodel=medany endif -ifeq ($(CONFIG_PERF_EVENTS),y) - 
KBUILD_CFLAGS += -fno-omit-frame-pointer -endif # Avoid generating .eh_frame sections. KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables From 08d0ce30e0e4fcb5f06c90fe40387b1ce9324833 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:46 +0000 Subject: [PATCH 31/51] riscv: Implement syscall wrappers Commit f0bddf50586d ("riscv: entry: Convert to generic entry") moved syscall handling to C code, which exposed function pointer type mismatches that trip fine-grained forward-edge Control-Flow Integrity (CFI) checks as syscall handlers are all called through the same syscall_t pointer type. To fix the type mismatches, implement pt_regs based syscall wrappers similarly to x86 and arm64. This patch is based on arm64 syscall wrappers added in commit 4378a7d4be30 ("arm64: implement syscall wrappers"), where the main goal was to minimize the risk of userspace-controlled values being used under speculation. This may be a concern for riscv in future as well. Following other architectures, the syscall wrappers generate three functions for each syscall; __riscv_<compat_>sys_<name> takes a pt_regs pointer and extracts arguments from registers, __se_<compat_>sys_<name> is a sign-extension wrapper that casts the long arguments to the correct types for the real syscall implementation, which is named __do_<compat_>sys_<name>.
Reviewed-by: Kees Cook Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-9-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/syscall.h | 5 +- arch/riscv/include/asm/syscall_wrapper.h | 87 ++++++++++++++++++++++++ arch/riscv/kernel/compat_syscall_table.c | 8 ++- arch/riscv/kernel/sys_riscv.c | 6 ++ arch/riscv/kernel/syscall_table.c | 8 ++- 6 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 arch/riscv/include/asm/syscall_wrapper.h diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..a475ef1a0c1c 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -35,6 +35,7 @@ config RISCV select ARCH_HAS_SET_MEMORY if MMU select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL + select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_VDSO_DATA diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index 0148c6bd9675..121fff429dce 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -75,7 +75,7 @@ static inline int syscall_get_arch(struct task_struct *task) #endif } -typedef long (*syscall_t)(ulong, ulong, ulong, ulong, ulong, ulong, ulong); +typedef long (*syscall_t)(const struct pt_regs *); static inline void syscall_handler(struct pt_regs *regs, ulong syscall) { syscall_t fn; @@ -87,8 +87,7 @@ static inline void syscall_handler(struct pt_regs *regs, ulong syscall) #endif fn = sys_call_table[syscall]; - regs->a0 = fn(regs->orig_a0, regs->a1, regs->a2, - regs->a3, regs->a4, regs->a5, regs->a6); + regs->a0 = fn(regs); } static inline bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) diff --git a/arch/riscv/include/asm/syscall_wrapper.h b/arch/riscv/include/asm/syscall_wrapper.h new file mode 100644 index 
000000000000..1d7942c8a6cb --- /dev/null +++ b/arch/riscv/include/asm/syscall_wrapper.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * syscall_wrapper.h - riscv specific wrappers to syscall definitions + * + * Based on arch/arm64/include/syscall_wrapper.h + */ + +#ifndef __ASM_SYSCALL_WRAPPER_H +#define __ASM_SYSCALL_WRAPPER_H + +#include + +asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *); + +#define SC_RISCV_REGS_TO_ARGS(x, ...) \ + __MAP(x,__SC_ARGS \ + ,,regs->orig_a0,,regs->a1,,regs->a2 \ + ,,regs->a3,,regs->a4,,regs->a5,,regs->a6) + +#ifdef CONFIG_COMPAT + +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__riscv_compat_sys##name, ERRNO); \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long __riscv_compat_sys##name(const struct pt_regs *regs) \ + { \ + return __se_compat_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \ + } \ + static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + return __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ + } \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#define COMPAT_SYSCALL_DEFINE0(sname) \ + asmlinkage long __riscv_compat_sys_##sname(const struct pt_regs *__unused); \ + ALLOW_ERROR_INJECTION(__riscv_compat_sys_##sname, ERRNO); \ + asmlinkage long __riscv_compat_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL_COMPAT(name) \ + asmlinkage long __weak __riscv_compat_sys_##name(const struct pt_regs *regs); \ + asmlinkage long __weak __riscv_compat_sys_##name(const struct pt_regs *regs) \ + { \ + return sys_ni_syscall(); \ + } + +#define COMPAT_SYS_NI(name) \ + SYSCALL_ALIAS(__riscv_compat_sys_##name, sys_ni_posix_timers); + +#endif /* CONFIG_COMPAT */ + +#define __SYSCALL_DEFINEx(x, name, ...) 
\ + asmlinkage long __riscv_sys##name(const struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__riscv_sys##name, ERRNO); \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long __riscv_sys##name(const struct pt_regs *regs) \ + { \ + return __se_sys##name(SC_RISCV_REGS_TO_ARGS(x,__VA_ARGS__)); \ + } \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __riscv_sys_##sname(const struct pt_regs *__unused); \ + ALLOW_ERROR_INJECTION(__riscv_sys_##sname, ERRNO); \ + asmlinkage long __riscv_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL(name) \ + asmlinkage long __weak __riscv_sys_##name(const struct pt_regs *regs); \ + asmlinkage long __weak __riscv_sys_##name(const struct pt_regs *regs) \ + { \ + return sys_ni_syscall(); \ + } + +#define SYS_NI(name) SYSCALL_ALIAS(__riscv_sys_##name, sys_ni_posix_timers); + +#endif /* __ASM_SYSCALL_WRAPPER_H */ diff --git a/arch/riscv/kernel/compat_syscall_table.c b/arch/riscv/kernel/compat_syscall_table.c index 651f2b009c28..ad7f2d712f5f 100644 --- a/arch/riscv/kernel/compat_syscall_table.c +++ b/arch/riscv/kernel/compat_syscall_table.c @@ -9,11 +9,15 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), +#define __SYSCALL(nr, call) asmlinkage long __riscv_##call(const struct pt_regs *); +#include + +#undef __SYSCALL +#define __SYSCALL(nr, call) [nr] = __riscv_##call, asmlinkage long compat_sys_rt_sigreturn(void); void * const compat_sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls - 1] = sys_ni_syscall, + [0 ... 
__NR_syscalls - 1] = __riscv_sys_ni_syscall, #include }; diff --git a/arch/riscv/kernel/sys_riscv.c b/arch/riscv/kernel/sys_riscv.c index 26ef5526bfb4..473159b5f303 100644 --- a/arch/riscv/kernel/sys_riscv.c +++ b/arch/riscv/kernel/sys_riscv.c @@ -335,3 +335,9 @@ SYSCALL_DEFINE5(riscv_hwprobe, struct riscv_hwprobe __user *, pairs, return do_riscv_hwprobe(pairs, pair_count, cpu_count, cpus, flags); } + +/* Not defined using SYSCALL_DEFINE0 to avoid error injection */ +asmlinkage long __riscv_sys_ni_syscall(const struct pt_regs *__unused) +{ + return -ENOSYS; +} diff --git a/arch/riscv/kernel/syscall_table.c b/arch/riscv/kernel/syscall_table.c index 44b1420a2270..dda913764903 100644 --- a/arch/riscv/kernel/syscall_table.c +++ b/arch/riscv/kernel/syscall_table.c @@ -10,9 +10,13 @@ #include #undef __SYSCALL -#define __SYSCALL(nr, call) [nr] = (call), +#define __SYSCALL(nr, call) asmlinkage long __riscv_##call(const struct pt_regs *); +#include + +#undef __SYSCALL +#define __SYSCALL(nr, call) [nr] = __riscv_##call, void * const sys_call_table[__NR_syscalls] = { - [0 ... __NR_syscalls - 1] = sys_ni_syscall, + [0 ... __NR_syscalls - 1] = __riscv_sys_ni_syscall, #include }; From 5f59c6855bad1809a4f85ce4db412f9ede45a4a0 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:47 +0000 Subject: [PATCH 32/51] riscv: Add types to indirectly called assembly functions With CONFIG_CFI_CLANG, assembly functions indirectly called from C code must be annotated with type identifiers to pass CFI checking. Use the SYM_TYPED_START macro to add types to the relevant functions. 
Reviewed-by: Kees Cook Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-10-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/mcount.S | 5 +++-- arch/riscv/kernel/suspend_entry.S | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 8a6e5a9e842a..6c9469050f4c 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -47,13 +48,13 @@ addi sp, sp, 4*SZREG .endm -ENTRY(ftrace_stub) +SYM_TYPED_FUNC_START(ftrace_stub) #ifdef CONFIG_DYNAMIC_FTRACE .global MCOUNT_NAME .set MCOUNT_NAME, ftrace_stub #endif ret -ENDPROC(ftrace_stub) +SYM_FUNC_END(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(return_to_handler) diff --git a/arch/riscv/kernel/suspend_entry.S b/arch/riscv/kernel/suspend_entry.S index 12b52afe09a4..f7960c7c5f9e 100644 --- a/arch/riscv/kernel/suspend_entry.S +++ b/arch/riscv/kernel/suspend_entry.S @@ -5,6 +5,7 @@ */ #include +#include #include #include #include @@ -58,7 +59,7 @@ ENTRY(__cpu_suspend_enter) ret END(__cpu_suspend_enter) -ENTRY(__cpu_resume_enter) +SYM_TYPED_FUNC_START(__cpu_resume_enter) /* Load the global pointer */ .option push .option norelax @@ -94,4 +95,4 @@ ENTRY(__cpu_resume_enter) /* Return to C code */ ret -END(__cpu_resume_enter) +SYM_FUNC_END(__cpu_resume_enter) From f3a0c23f2539a69792df4586485225fda5fab988 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:48 +0000 Subject: [PATCH 33/51] riscv: Add ftrace_stub_graph Commit 883bbbffa5a4 ("ftrace,kcfi: Separate ftrace_stub() and ftrace_stub_graph()") added a separate ftrace_stub_graph function for CFI_CLANG. Add the stub to fix FUNCTION_GRAPH_TRACER compatibility with CFI. 
Reviewed-by: Kees Cook Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-11-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/mcount.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kernel/mcount.S b/arch/riscv/kernel/mcount.S index 6c9469050f4c..8818a8fa9ff3 100644 --- a/arch/riscv/kernel/mcount.S +++ b/arch/riscv/kernel/mcount.S @@ -57,6 +57,10 @@ SYM_TYPED_FUNC_START(ftrace_stub) SYM_FUNC_END(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER +SYM_TYPED_FUNC_START(ftrace_stub_graph) + ret +SYM_FUNC_END(ftrace_stub_graph) + ENTRY(return_to_handler) /* * On implementing the frame point test, the ideal way is to compare the From af0ead42f69389cd4ed68e1a4c6cde45c0adb35c Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:49 +0000 Subject: [PATCH 34/51] riscv: Add CFI error handling With CONFIG_CFI_CLANG, the compiler injects a type preamble immediately before each function and a check to validate the target function type before indirect calls: ; type preamble .word <id> function: ... ; indirect call check lw t1, -4(a0) lui t2, <hi20> addiw t2, t2, <lo12> beq t1, t2, .Ltmp0 ebreak .Ltmp0: jalr a0 Implement error handling code for the ebreak traps emitted for the checks. 
This produces the following oops on a CFI failure (generated using lkdtm): [ 21.177245] CFI failure at lkdtm_indirect_call+0x22/0x32 [lkdtm] (target: lkdtm_increment_int+0x0/0x18 [lkdtm]; expected type: 0x3ad55aca) [ 21.178483] Kernel BUG [#1] [ 21.178671] Modules linked in: lkdtm [ 21.179037] CPU: 1 PID: 104 Comm: sh Not tainted 6.3.0-rc6-00037-g37d5ec6297ab #1 [ 21.179511] Hardware name: riscv-virtio,qemu (DT) [ 21.179818] epc : lkdtm_indirect_call+0x22/0x32 [lkdtm] [ 21.180106] ra : lkdtm_CFI_FORWARD_PROTO+0x48/0x7c [lkdtm] [ 21.180426] epc : ffffffff01387092 ra : ffffffff01386f14 sp : ff20000000453cf0 [ 21.180792] gp : ffffffff81308c38 tp : ff6000000243f080 t0 : ff20000000453b78 [ 21.181157] t1 : 000000003ad55aca t2 : 000000007e0c52a5 s0 : ff20000000453d00 [ 21.181506] s1 : 0000000000000001 a0 : ffffffff0138d170 a1 : ffffffff013870bc [ 21.181819] a2 : b5fea48dd89aa700 a3 : 0000000000000001 a4 : 0000000000000fff [ 21.182169] a5 : 0000000000000004 a6 : 00000000000000b7 a7 : 0000000000000000 [ 21.182591] s2 : ff20000000453e78 s3 : ffffffffffffffea s4 : 0000000000000012 [ 21.183001] s5 : ff600000023c7000 s6 : 0000000000000006 s7 : ffffffff013882a0 [ 21.183653] s8 : 0000000000000008 s9 : 0000000000000002 s10: ffffffff0138d878 [ 21.184245] s11: ffffffff0138d878 t3 : 0000000000000003 t4 : 0000000000000000 [ 21.184591] t5 : ffffffff8133df08 t6 : ffffffff8133df07 [ 21.184858] status: 0000000000000120 badaddr: 0000000000000000 cause: 0000000000000003 [ 21.185415] [] lkdtm_indirect_call+0x22/0x32 [lkdtm] [ 21.185772] [] lkdtm_CFI_FORWARD_PROTO+0x48/0x7c [lkdtm] [ 21.186093] [] lkdtm_do_action+0x22/0x34 [lkdtm] [ 21.186445] [] direct_entry+0x128/0x13a [lkdtm] [ 21.186817] [] full_proxy_write+0x58/0xb2 [ 21.187352] [] vfs_write+0x14c/0x33a [ 21.187644] [] ksys_write+0x64/0xd4 [ 21.187832] [] sys_write+0xe/0x1a [ 21.188171] [] ret_from_syscall+0x0/0x2 [ 21.188595] Code: 0513 0f65 a303 ffc5 53b7 7e0c 839b 2a53 0363 0073 (9002) 9582 [ 21.189178] ---[ end trace 0000000000000000 
]--- [ 21.189590] Kernel panic - not syncing: Fatal exception Reviewed-by: Kees Cook Reviewed-by: Conor Dooley # ISA bits Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-12-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/cfi.h | 22 ++++++++++ arch/riscv/include/asm/insn.h | 10 +++++ arch/riscv/kernel/Makefile | 2 + arch/riscv/kernel/cfi.c | 77 +++++++++++++++++++++++++++++++++++ arch/riscv/kernel/traps.c | 4 +- 6 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/include/asm/cfi.h create mode 100644 arch/riscv/kernel/cfi.c diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a475ef1a0c1c..29fdba9d8514 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -49,6 +49,7 @@ config RISCV select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS + select ARCH_USES_CFI_TRAPS if CFI_CLANG select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT diff --git a/arch/riscv/include/asm/cfi.h b/arch/riscv/include/asm/cfi.h new file mode 100644 index 000000000000..56bf9d69d5e3 --- /dev/null +++ b/arch/riscv/include/asm/cfi.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_CFI_H +#define _ASM_RISCV_CFI_H + +/* + * Clang Control Flow Integrity (CFI) support. 
+ * + * Copyright (C) 2023 Google LLC + */ + +#include + +#ifdef CONFIG_CFI_CLANG +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs); +#else +static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_NONE; +} +#endif /* CONFIG_CFI_CLANG */ + +#endif /* _ASM_RISCV_CFI_H */ diff --git a/arch/riscv/include/asm/insn.h b/arch/riscv/include/asm/insn.h index 4e1505cef8aa..9c23f598434c 100644 --- a/arch/riscv/include/asm/insn.h +++ b/arch/riscv/include/asm/insn.h @@ -63,6 +63,7 @@ #define RVG_RS1_OPOFF 15 #define RVG_RS2_OPOFF 20 #define RVG_RD_OPOFF 7 +#define RVG_RS1_MASK GENMASK(4, 0) #define RVG_RD_MASK GENMASK(4, 0) /* The bit field of immediate value in RVC J instruction */ @@ -129,6 +130,7 @@ #define RVC_C2_RS1_OPOFF 7 #define RVC_C2_RS2_OPOFF 2 #define RVC_C2_RD_OPOFF 7 +#define RVC_C2_RS1_MASK GENMASK(4, 0) /* parts of opcode for RVG*/ #define RVG_OPCODE_FENCE 0x0f @@ -278,6 +280,10 @@ static __always_inline bool riscv_insn_is_branch(u32 code) #define RV_X(X, s, mask) (((X) >> (s)) & (mask)) #define RVC_X(X, s, mask) RV_X(X, s, mask) +#define RV_EXTRACT_RS1_REG(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RVG_RS1_OPOFF, RVG_RS1_MASK)); }) + #define RV_EXTRACT_RD_REG(x) \ ({typeof(x) x_ = (x); \ (RV_X(x_, RVG_RD_OPOFF, RVG_RD_MASK)); }) @@ -305,6 +311,10 @@ static __always_inline bool riscv_insn_is_branch(u32 code) (RV_X(x_, RV_B_IMM_11_OPOFF, RV_B_IMM_11_MASK) << RV_B_IMM_11_OFF) | \ (RV_IMM_SIGN(x_) << RV_B_IMM_SIGN_OFF); }) +#define RVC_EXTRACT_C2_RS1_REG(x) \ + ({typeof(x) x_ = (x); \ + (RV_X(x_, RVC_C2_RS1_OPOFF, RVC_C2_RS1_MASK)); }) + #define RVC_EXTRACT_JTYPE_IMM(x) \ ({typeof(x) x_ = (x); \ (RVC_X(x_, RVC_J_IMM_3_1_OPOFF, RVC_J_IMM_3_1_MASK) << RVC_J_IMM_3_1_OFF) | \ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 506cc4a9a45a..6ac56af42f4a 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -91,6 +91,8 @@ obj-$(CONFIG_CRASH_CORE) += crash_core.o 
obj-$(CONFIG_JUMP_LABEL) += jump_label.o +obj-$(CONFIG_CFI_CLANG) += cfi.o + obj-$(CONFIG_EFI) += efi.o obj-$(CONFIG_COMPAT) += compat_syscall_table.o obj-$(CONFIG_COMPAT) += compat_signal.o diff --git a/arch/riscv/kernel/cfi.c b/arch/riscv/kernel/cfi.c new file mode 100644 index 000000000000..820158d7a291 --- /dev/null +++ b/arch/riscv/kernel/cfi.c @@ -0,0 +1,77 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Clang Control Flow Integrity (CFI) support. + * + * Copyright (C) 2023 Google LLC + */ +#include +#include + +/* + * Returns the target address and the expected type when regs->epc points + * to a compiler-generated CFI trap. + */ +static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target, + u32 *type) +{ + unsigned long *regs_ptr = (unsigned long *)regs; + int rs1_num; + u32 insn; + + *target = *type = 0; + + /* + * The compiler generates the following instruction sequence + * for indirect call checks: + * + *   lw t1, -4() + * lui t2, + * addiw t2, t2, + * beq t1, t2, .Ltmp1 + * ebreak ; <- regs->epc + * .Ltmp1: + * jalr + * + * We can read the expected type and the target address from the + * registers passed to the beq/jalr instructions. + */ + if (get_kernel_nofault(insn, (void *)regs->epc - 4)) + return false; + if (!riscv_insn_is_beq(insn)) + return false; + + *type = (u32)regs_ptr[RV_EXTRACT_RS1_REG(insn)]; + + if (get_kernel_nofault(insn, (void *)regs->epc) || + get_kernel_nofault(insn, (void *)regs->epc + GET_INSN_LENGTH(insn))) + return false; + + if (riscv_insn_is_jalr(insn)) + rs1_num = RV_EXTRACT_RS1_REG(insn); + else if (riscv_insn_is_c_jalr(insn)) + rs1_num = RVC_EXTRACT_C2_RS1_REG(insn); + else + return false; + + *target = regs_ptr[rs1_num]; + + return true; +} + +/* + * Checks if the ebreak trap is because of a CFI failure, and handles the trap + * if needed. Returns a bug_trap_type value similarly to report_bug. 
+ */ +enum bug_trap_type handle_cfi_failure(struct pt_regs *regs) +{ + unsigned long target; + u32 type; + + if (!is_cfi_trap(regs->epc)) + return BUG_TRAP_TYPE_NONE; + + if (!decode_cfi_insn(regs, &target, &type)) + return report_cfi_failure_noaddr(regs, regs->epc); + + return report_cfi_failure(regs, regs->epc, &target, type); +} diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index f910dfccbf5d..212dc20631fb 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -271,7 +272,8 @@ void handle_break(struct pt_regs *regs) == NOTIFY_STOP) return; #endif - else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN) + else if (report_bug(regs->epc, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) regs->epc += get_break_insn_length(regs->epc); else die(regs, "Kernel BUG"); From a72ab0361110db51488c670863551eb01428470e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:50 +0000 Subject: [PATCH 35/51] riscv/purgatory: Disable CFI Filter out CC_FLAGS_CFI when CONFIG_CFI_CLANG. 
Reviewed-by: Kees Cook Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-13-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/purgatory/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/purgatory/Makefile b/arch/riscv/purgatory/Makefile index dc20e166983e..9e6476719abb 100644 --- a/arch/riscv/purgatory/Makefile +++ b/arch/riscv/purgatory/Makefile @@ -77,6 +77,10 @@ ifdef CONFIG_STACKPROTECTOR_STRONG PURGATORY_CFLAGS_REMOVE += -fstack-protector-strong endif +ifdef CONFIG_CFI_CLANG +PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI) +endif + CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE) CFLAGS_purgatory.o += $(PURGATORY_CFLAGS) From 74f8fc31feb4b756814ec0720f48ccdc1175f774 Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 10 Jul 2023 18:35:51 +0000 Subject: [PATCH 36/51] riscv: Allow CONFIG_CFI_CLANG to be selected Select ARCH_SUPPORTS_CFI_CLANG to allow CFI_CLANG to be selected on riscv. Reviewed-by: Kees Cook Tested-by: Nathan Chancellor Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230710183544.999540-14-samitolvanen@google.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 29fdba9d8514..68c790b181c3 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -43,6 +43,7 @@ config RISCV select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT select ARCH_STACKWALK select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU From b922bf04d2c1355633bdefbc2ed5fba1f0d4df07 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 11 Jul 2023 23:07:53 +1000 Subject: [PATCH 37/51] binfmt_elf_fdpic: support 64-bit systems The binfmt_elf_fdpic code has a number of 32-bit specific data structures associated with it. 
Extend it to be able to support and be used on 64-bit systems as well. The new code defines a number of key 64-bit variants of the core elf-fdpic data structures - along side the existing 32-bit sized ones. A common set of generic named structures are defined to be either the 32-bit or 64-bit ones as required at compile time. This is a similar technique to that used in the ELF binfmt loader. For example: elf_fdpic_loadseg is either elf32_fdpic_loadseg or elf64_fdpic_loadseg elf_fdpic_loadmap is either elf32_fdpic_loadmap or elf64_fdpic_loadmap the choice based on ELFCLASS32 or ELFCLASS64. Signed-off-by: Greg Ungerer Acked-by: Kees Cook Link: https://lore.kernel.org/r/20230711130754.481209-2-gerg@kernel.org Signed-off-by: Palmer Dabbelt --- fs/binfmt_elf_fdpic.c | 38 +++++++++++++++++----------------- include/linux/elf-fdpic.h | 14 ++++++++++++- include/uapi/linux/elf-fdpic.h | 15 ++++++++++++++ 3 files changed, 47 insertions(+), 20 deletions(-) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 1c6c5832af86..43b2a2851ba3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -138,7 +138,7 @@ static int is_constdisp(struct elfhdr *hdr) static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, struct file *file) { - struct elf32_phdr *phdr; + struct elf_phdr *phdr; unsigned long size; int retval, loop; loff_t pos = params->hdr.e_phoff; @@ -560,8 +560,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, sp &= ~7UL; /* stack the load map(s) */ - len = sizeof(struct elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * exec_params->loadmap->nsegs; + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * exec_params->loadmap->nsegs; sp = (sp - len) & ~7UL; exec_params->map_addr = sp; @@ -571,8 +571,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, current->mm->context.exec_fdpic_loadmap = (unsigned long) sp; if (interp_params->loadmap) { - len = sizeof(struct 
elf32_fdpic_loadmap); - len += sizeof(struct elf32_fdpic_loadseg) * + len = sizeof(struct elf_fdpic_loadmap); + len += sizeof(struct elf_fdpic_loadseg) * interp_params->loadmap->nsegs; sp = (sp - len) & ~7UL; interp_params->map_addr = sp; @@ -740,13 +740,13 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, struct mm_struct *mm, const char *what) { - struct elf32_fdpic_loadmap *loadmap; + struct elf_fdpic_loadmap *loadmap; #ifdef CONFIG_MMU - struct elf32_fdpic_loadseg *mseg; + struct elf_fdpic_loadseg *mseg; unsigned long load_addr; #endif - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned nloads, tmp; unsigned long stop; int loop, ret; @@ -766,7 +766,7 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, params->loadmap = loadmap; - loadmap->version = ELF32_FDPIC_LOADMAP_VERSION; + loadmap->version = ELF_FDPIC_LOADMAP_VERSION; loadmap->nsegs = nloads; /* map the requested LOADs into the memory space */ @@ -839,8 +839,8 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, if (phdr->p_vaddr >= seg->p_vaddr && phdr->p_vaddr + phdr->p_memsz <= seg->p_vaddr + seg->p_memsz) { - Elf32_Dyn __user *dyn; - Elf32_Sword d_tag; + Elf_Dyn __user *dyn; + Elf_Sword d_tag; params->dynamic_addr = (phdr->p_vaddr - seg->p_vaddr) + @@ -850,11 +850,11 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params, * one item, and that the last item is a NULL * entry */ if (phdr->p_memsz == 0 || - phdr->p_memsz % sizeof(Elf32_Dyn) != 0) + phdr->p_memsz % sizeof(Elf_Dyn) != 0) goto dynamic_error; - tmp = phdr->p_memsz / sizeof(Elf32_Dyn); - dyn = (Elf32_Dyn __user *)params->dynamic_addr; + tmp = phdr->p_memsz / sizeof(Elf_Dyn); + dyn = (Elf_Dyn __user *)params->dynamic_addr; if (get_user(d_tag, &dyn[tmp - 1].d_tag) || d_tag != 0) goto dynamic_error; @@ -923,8 +923,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( struct file *file, struct mm_struct *mm) { - 
struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, base = ULONG_MAX, top = 0, maddr = 0; int loop, ret; @@ -1007,8 +1007,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct file *file, struct mm_struct *mm) { - struct elf32_fdpic_loadseg *seg; - struct elf32_phdr *phdr; + struct elf_fdpic_loadseg *seg; + struct elf_phdr *phdr; unsigned long load_addr, delta_vaddr; int loop, dvset; diff --git a/include/linux/elf-fdpic.h b/include/linux/elf-fdpic.h index 3bea95a1af53..e533f4513194 100644 --- a/include/linux/elf-fdpic.h +++ b/include/linux/elf-fdpic.h @@ -10,13 +10,25 @@ #include +#if ELF_CLASS == ELFCLASS32 +#define Elf_Sword Elf32_Sword +#define elf_fdpic_loadseg elf32_fdpic_loadseg +#define elf_fdpic_loadmap elf32_fdpic_loadmap +#define ELF_FDPIC_LOADMAP_VERSION ELF32_FDPIC_LOADMAP_VERSION +#else +#define Elf_Sword Elf64_Sxword +#define elf_fdpic_loadmap elf64_fdpic_loadmap +#define elf_fdpic_loadseg elf64_fdpic_loadseg +#define ELF_FDPIC_LOADMAP_VERSION ELF64_FDPIC_LOADMAP_VERSION +#endif + /* * binfmt binary parameters structure */ struct elf_fdpic_params { struct elfhdr hdr; /* ref copy of ELF header */ struct elf_phdr *phdrs; /* ref copy of PT_PHDR table */ - struct elf32_fdpic_loadmap *loadmap; /* loadmap to be passed to userspace */ + struct elf_fdpic_loadmap *loadmap; /* loadmap to be passed to userspace */ unsigned long elfhdr_addr; /* mapped ELF header user address */ unsigned long ph_addr; /* mapped PT_PHDR user address */ unsigned long map_addr; /* mapped loadmap user address */ diff --git a/include/uapi/linux/elf-fdpic.h b/include/uapi/linux/elf-fdpic.h index 4fcc6cfebe18..ec23f0871129 100644 --- a/include/uapi/linux/elf-fdpic.h +++ b/include/uapi/linux/elf-fdpic.h @@ -32,4 +32,19 @@ struct elf32_fdpic_loadmap { #define ELF32_FDPIC_LOADMAP_VERSION 0x0000 +/* segment mappings for ELF FDPIC libraries/executables/interpreters */ 
+struct elf64_fdpic_loadseg { + Elf64_Addr addr; /* core address to which mapped */ + Elf64_Addr p_vaddr; /* VMA recorded in file */ + Elf64_Word p_memsz; /* allocation size recorded in file */ +}; + +struct elf64_fdpic_loadmap { + Elf64_Half version; /* version of these structures, just in case... */ + Elf64_Half nsegs; /* number of segments */ + struct elf64_fdpic_loadseg segs[]; +}; + +#define ELF64_FDPIC_LOADMAP_VERSION 0x0000 + #endif /* _UAPI_LINUX_ELF_FDPIC_H */ From 9549fb354ef1a451ceddfa404ae3e943c5c803d0 Mon Sep 17 00:00:00 2001 From: Greg Ungerer Date: Tue, 11 Jul 2023 23:07:54 +1000 Subject: [PATCH 38/51] riscv: support the elf-fdpic binfmt loader Add support for enabling and using the binfmt_elf_fdpic program loader on RISC-V platforms. The most important change is to setup registers during program load to pass the mapping addresses to the new process. One of the interesting features of the elf-fdpic loader is that it also allows appropriately compiled ELF format binaries to be loaded on nommu systems. Appropriate being those compiled with -pie. 
Signed-off-by: Greg Ungerer Acked-by: Kees Cook Link: https://lore.kernel.org/r/20230711130754.481209-3-gerg@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/elf.h | 11 ++++++++++- arch/riscv/include/asm/mmu.h | 4 ++++ arch/riscv/include/uapi/asm/ptrace.h | 5 +++++ fs/Kconfig.binfmt | 2 +- 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index c24280774caf..c33fe923ef6d 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -41,6 +41,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); #define compat_elf_check_arch compat_elf_check_arch #define CORE_DUMP_USE_REGSET +#define ELF_FDPIC_CORE_EFLAGS 0 #define ELF_EXEC_PAGESIZE (PAGE_SIZE) /* @@ -69,6 +70,13 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); #define ELF_HWCAP riscv_get_elf_hwcap() extern unsigned long elf_hwcap; +#define ELF_FDPIC_PLAT_INIT(_r, _exec_map_addr, _interp_map_addr, dynamic_addr) \ + do { \ + (_r)->a1 = _exec_map_addr; \ + (_r)->a2 = _interp_map_addr; \ + (_r)->a3 = dynamic_addr; \ + } while (0) + /* * This yields a string that ld.so will use to load implementation * specific libraries for optimization. This is more specific in @@ -78,7 +86,6 @@ extern unsigned long elf_hwcap; #define COMPAT_ELF_PLATFORM (NULL) -#ifdef CONFIG_MMU #define ARCH_DLINFO \ do { \ /* \ @@ -115,6 +122,8 @@ do { \ else \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) + +#ifdef CONFIG_MMU #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, diff --git a/arch/riscv/include/asm/mmu.h b/arch/riscv/include/asm/mmu.h index 0099dc116168..355504b37f8e 100644 --- a/arch/riscv/include/asm/mmu.h +++ b/arch/riscv/include/asm/mmu.h @@ -20,6 +20,10 @@ typedef struct { /* A local icache flush is needed before user execution can resume. 
*/ cpumask_t icache_stale_mask; #endif +#ifdef CONFIG_BINFMT_ELF_FDPIC + unsigned long exec_fdpic_loadmap; + unsigned long interp_fdpic_loadmap; +#endif } mm_context_t; void __init create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, diff --git a/arch/riscv/include/uapi/asm/ptrace.h b/arch/riscv/include/uapi/asm/ptrace.h index e17c550986a6..30f6d6537adc 100644 --- a/arch/riscv/include/uapi/asm/ptrace.h +++ b/arch/riscv/include/uapi/asm/ptrace.h @@ -10,6 +10,11 @@ #include +#define PTRACE_GETFDPIC 33 + +#define PTRACE_GETFDPIC_EXEC 0 +#define PTRACE_GETFDPIC_INTERP 1 + /* * User-mode register state for core dumps, ptrace, sigcontext * diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 93539aac0e5b..f5693164ca9a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -58,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on ARM || ((M68K || SUPERH || XTENSA) && !MMU) + depends on ARM || ((M68K || RISCV || SUPERH || XTENSA) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load From 2926715163cfacea481d218f9151d091f5c0c27a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 18 Jul 2023 23:22:13 +0800 Subject: [PATCH 39/51] riscv: allow kmalloc() caches aligned to the smallest value Currently, riscv defines ARCH_DMA_MINALIGN as L1_CACHE_BYTES, i.e. 64 bytes, if CONFIG_RISCV_DMA_NONCOHERENT=y. To support a unified kernel Image, usually we have to enable CONFIG_RISCV_DMA_NONCOHERENT, thus it brings some bad effects to coherent platforms: Firstly, it wastes memory: the kmalloc-96, kmalloc-32, kmalloc-16 and kmalloc-8 slab caches don't exist any more; they are replaced with either kmalloc-128 or kmalloc-64. Secondly, larger than necessary kmalloc aligned allocations result in unnecessary cache/TLB pressure. This issue also exists on arm64 platforms. 
Since last year, Catalin has been working to solve this issue by decoupling ARCH_KMALLOC_MINALIGN from ARCH_DMA_MINALIGN, limiting kmalloc() minimum alignment to dma_get_cache_alignment() and replacing ARCH_KMALLOC_MINALIGN usage in various drivers with ARCH_DMA_MINALIGN etc.[1] One fact we can make use of for riscv: if the CPU doesn't support ZICBOM or T-HEAD CMO, we know the platform is coherent. Based on Catalin's work and the above fact, we can easily solve the kmalloc alignment issue for riscv: we can override dma_get_cache_alignment(), then let it return ARCH_DMA_MINALIGN at the beginning and return 1 once we know the underlying HW neither supports ZICBOM nor supports T-HEAD CMO. So what if the CPU supports ZICBOM or T-HEAD CMO, but all the devices are dma coherent? Well, we use ARCH_DMA_MINALIGN as the kmalloc minimum alignment, so nothing changes in this case. This case can be improved in the future. After this patch, a simple test of booting to a small buildroot rootfs on qemu shows: kmalloc-96 5041 5041 96 ... kmalloc-64 9606 9606 64 ... kmalloc-32 5128 5128 32 ... kmalloc-16 7682 7682 16 ... kmalloc-8 10246 10246 8 ... So we save about 1268KB memory. The saving will be much larger in normal OS env on real HW platforms.
Link: https://lore.kernel.org/linux-arm-kernel/20230524171904.3967031-1-catalin.marinas@arm.com/ [1] Signed-off-by: Jisheng Zhang Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230718152214.2907-2-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cache.h | 14 ++++++++++++++ arch/riscv/include/asm/cacheflush.h | 2 ++ arch/riscv/kernel/setup.c | 1 + arch/riscv/mm/dma-noncoherent.c | 8 ++++++++ 4 files changed, 25 insertions(+) diff --git a/arch/riscv/include/asm/cache.h b/arch/riscv/include/asm/cache.h index d3036df23ccb..2174fe7bac9a 100644 --- a/arch/riscv/include/asm/cache.h +++ b/arch/riscv/include/asm/cache.h @@ -13,6 +13,7 @@ #ifdef CONFIG_RISCV_DMA_NONCOHERENT #define ARCH_DMA_MINALIGN L1_CACHE_BYTES +#define ARCH_KMALLOC_MINALIGN (8) #endif /* @@ -23,4 +24,17 @@ #define ARCH_SLAB_MINALIGN 16 #endif +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_RISCV_DMA_NONCOHERENT +extern int dma_cache_alignment; +#define dma_get_cache_alignment dma_get_cache_alignment +static inline int dma_get_cache_alignment(void) +{ + return dma_cache_alignment; +} +#endif + +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_RISCV_CACHE_H */ diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8091b8bf4883..c640ab6f843b 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -55,8 +55,10 @@ void riscv_init_cbo_blocksizes(void); #ifdef CONFIG_RISCV_DMA_NONCOHERENT void riscv_noncoherent_supported(void); +void __init riscv_set_dma_cache_alignment(void); #else static inline void riscv_noncoherent_supported(void) {} +static inline void riscv_set_dma_cache_alignment(void) {} #endif /* diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 971fe776e2f8..027879b1557a 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -311,6 +311,7 @@ void __init setup_arch(char **cmdline_p) if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && riscv_isa_extension_available(NULL, 
ZICBOM)) riscv_noncoherent_supported(); + riscv_set_dma_cache_alignment(); } static int __init topology_init(void) diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c index d51a75864e53..7270b4d8c05b 100644 --- a/arch/riscv/mm/dma-noncoherent.c +++ b/arch/riscv/mm/dma-noncoherent.c @@ -11,6 +11,8 @@ #include static bool noncoherent_supported __ro_after_init; +int dma_cache_alignment __ro_after_init = ARCH_DMA_MINALIGN; +EXPORT_SYMBOL_GPL(dma_cache_alignment); void arch_sync_dma_for_device(phys_addr_t paddr, size_t size, enum dma_data_direction dir) @@ -78,3 +80,9 @@ void riscv_noncoherent_supported(void) "Non-coherent DMA support enabled without a block size\n"); noncoherent_supported = true; } + +void __init riscv_set_dma_cache_alignment(void) +{ + if (!noncoherent_supported) + dma_cache_alignment = 1; +} From f51f7a0fc2f4a6cd786327f485e5aba4c9006866 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 18 Jul 2023 23:22:14 +0800 Subject: [PATCH 40/51] riscv: enable DMA_BOUNCE_UNALIGNED_KMALLOC for !dma_coherent With the DMA bouncing of unaligned kmalloc() buffers now in place, enable it for riscv when RISCV_DMA_NONCOHERENT=y to allow the kmalloc-{8,16,32,96} caches. Since RV32 doesn't enable SWIOTLB yet, and I didn't see any dma noncoherent RV32 platforms in the mainline, so skip RV32 now by only enabling DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB is available. Once we see such requirement on RV32, we can enable it then. NOTE: we didn't force to create the swiotlb buffer even when the end of RAM is within the 32-bit physical address range. That's to say: For RV64 with > 4GB memory, the feature is enabled. For RV64 with <= 4GB memory, the feature isn't enabled by default. We rely on users to pass "swiotlb=mmnn,force" where mmnn is the Number of I/O TLB slabs, see kernel-parameters.txt for details. Tested on Sipeed Lichee Pi 4A with 8GB DDR and Sipeed M1S BL808 Dock board. 
Signed-off-by: Jisheng Zhang Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230718152214.2907-3-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..6681bd6ed2d7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -267,6 +267,7 @@ config RISCV_DMA_NONCOHERENT select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEVICE + select DMA_BOUNCE_UNALIGNED_KMALLOC if SWIOTLB select DMA_DIRECT_REMAP config AS_HAS_INSN From add2cc6b6515f78d3a150f1fbbaf12c28c4bb20a Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 9 Aug 2023 16:22:01 -0700 Subject: [PATCH 41/51] RISC-V: mm: Restrict address space for sv39,sv48,sv57 Make sv48 the default address space for mmap as some applications currently depend on this assumption. A hint address passed to mmap will cause the largest address space that fits entirely into the hint to be used. If the hint is less than or equal to 1<<38, an sv39 address will be used. An exception is that if the hint address is 0, then a sv48 address will be used. After an address space is completely full, the next smallest address space will be used. Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20230809232218.849726-2-charlie@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/elf.h | 2 +- arch/riscv/include/asm/pgtable.h | 25 ++++++++++++-- arch/riscv/include/asm/processor.h | 52 ++++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 9 deletions(-) diff --git a/arch/riscv/include/asm/elf.h b/arch/riscv/include/asm/elf.h index c24280774caf..5d3368d5585c 100644 --- a/arch/riscv/include/asm/elf.h +++ b/arch/riscv/include/asm/elf.h @@ -49,7 +49,7 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr); * the loader. 
We need to make sure that it is out of the way of the program * that it will "exec", and that there is sufficient room for the brk. */ -#define ELF_ET_DYN_BASE ((TASK_SIZE / 3) * 2) +#define ELF_ET_DYN_BASE ((DEFAULT_MAP_WINDOW / 3) * 2) #ifdef CONFIG_64BIT #ifdef CONFIG_COMPAT diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 75970ee2bda2..bb0b9ac7b581 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -62,11 +62,16 @@ * struct pages to map half the virtual address space. Then * position vmemmap directly below the VMALLOC region. */ +#define VA_BITS_SV32 32 #ifdef CONFIG_64BIT +#define VA_BITS_SV39 39 +#define VA_BITS_SV48 48 +#define VA_BITS_SV57 57 + #define VA_BITS (pgtable_l5_enabled ? \ - 57 : (pgtable_l4_enabled ? 48 : 39)) + VA_BITS_SV57 : (pgtable_l4_enabled ? VA_BITS_SV48 : VA_BITS_SV39)) #else -#define VA_BITS 32 +#define VA_BITS VA_BITS_SV32 #endif #define VMEMMAP_SHIFT \ @@ -111,11 +116,27 @@ #include #include #include +#include #define __page_val_to_pfn(_val) (((_val) & _PAGE_PFN_MASK) >> _PAGE_PFN_SHIFT) #ifdef CONFIG_64BIT #include + +#define VA_USER_SV39 (UL(1) << (VA_BITS_SV39 - 1)) +#define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1)) +#define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1)) + +#ifdef CONFIG_COMPAT +#define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS) +#define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39) +#define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64) +#define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64) +#else +#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? 
VA_BITS_SV48 : VA_BITS) +#define MMAP_MIN_VA_BITS (VA_BITS_SV39) +#endif /* CONFIG_COMPAT */ + #else #include #endif /* CONFIG_64BIT */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index c950a8d9edef..3e23e1786d05 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -13,19 +13,59 @@ #include +#ifdef CONFIG_64BIT +#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1)) +#define STACK_TOP_MAX TASK_SIZE_64 + +#define arch_get_mmap_end(addr, len, flags) \ +({ \ + unsigned long mmap_end; \ + typeof(addr) _addr = (addr); \ + if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + mmap_end = STACK_TOP_MAX; \ + else if ((_addr) >= VA_USER_SV57) \ + mmap_end = STACK_TOP_MAX; \ + else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \ + mmap_end = VA_USER_SV48; \ + else \ + mmap_end = VA_USER_SV39; \ + mmap_end; \ +}) + +#define arch_get_mmap_base(addr, base) \ +({ \ + unsigned long mmap_base; \ + typeof(addr) _addr = (addr); \ + typeof(base) _base = (base); \ + unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \ + if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \ + mmap_base = (_base); \ + else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \ + mmap_base = VA_USER_SV57 - rnd_gap; \ + else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \ + mmap_base = VA_USER_SV48 - rnd_gap; \ + else \ + mmap_base = VA_USER_SV39 - rnd_gap; \ + mmap_base; \ +}) + +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#define STACK_TOP_MAX TASK_SIZE +#endif +#define STACK_ALIGN 16 + +#define STACK_TOP DEFAULT_MAP_WINDOW + /* * This decides where the kernel will search for a free chunk of vm * space during mmap's. 
*/ -#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3) - -#define STACK_TOP TASK_SIZE #ifdef CONFIG_64BIT -#define STACK_TOP_MAX TASK_SIZE_64 +#define TASK_UNMAPPED_BASE PAGE_ALIGN((UL(1) << MMAP_MIN_VA_BITS) / 3) #else -#define STACK_TOP_MAX TASK_SIZE +#define TASK_UNMAPPED_BASE PAGE_ALIGN(TASK_SIZE / 3) #endif -#define STACK_ALIGN 16 #ifndef __ASSEMBLY__ From 4d0c04eac0c2d4e0100bbc67cc2fb48c3a53d8c8 Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 9 Aug 2023 16:22:02 -0700 Subject: [PATCH 42/51] RISC-V: mm: Add tests for RISC-V mm Add tests that enforce mmap hint address behavior. mmap should default to sv48. mmap will provide an address at the highest address space that can fit into the hint address, unless the hint address is less than sv39 and not 0, then it will return a sv39 address. These tests are split into two files: mmap_default.c and mmap_bottomup.c because a new process must be exec'd in order to change the mmap layout. The run_mmap.sh script sets the stack to be unlimited for the mmap_bottomup.c test which triggers a bottomup layout. 
Signed-off-by: Charlie Jenkins Link: https://lore.kernel.org/r/20230809232218.849726-3-charlie@rivosinc.com Signed-off-by: Palmer Dabbelt --- tools/testing/selftests/riscv/Makefile | 2 +- tools/testing/selftests/riscv/mm/.gitignore | 2 + tools/testing/selftests/riscv/mm/Makefile | 15 +++++ .../riscv/mm/testcases/mmap_bottomup.c | 35 ++++++++++ .../riscv/mm/testcases/mmap_default.c | 35 ++++++++++ .../selftests/riscv/mm/testcases/mmap_test.h | 64 +++++++++++++++++++ .../selftests/riscv/mm/testcases/run_mmap.sh | 12 ++++ 7 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/riscv/mm/.gitignore create mode 100644 tools/testing/selftests/riscv/mm/Makefile create mode 100644 tools/testing/selftests/riscv/mm/testcases/mmap_bottomup.c create mode 100644 tools/testing/selftests/riscv/mm/testcases/mmap_default.c create mode 100644 tools/testing/selftests/riscv/mm/testcases/mmap_test.h create mode 100755 tools/testing/selftests/riscv/mm/testcases/run_mmap.sh diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 9dd629cc86aa..1b79da90396e 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -5,7 +5,7 @@ ARCH ?= $(shell uname -m 2>/dev/null || echo not) ifneq (,$(filter $(ARCH),riscv)) -RISCV_SUBTARGETS ?= hwprobe vector +RISCV_SUBTARGETS ?= hwprobe vector mm else RISCV_SUBTARGETS := endif diff --git a/tools/testing/selftests/riscv/mm/.gitignore b/tools/testing/selftests/riscv/mm/.gitignore new file mode 100644 index 000000000000..5c2c57cb950c --- /dev/null +++ b/tools/testing/selftests/riscv/mm/.gitignore @@ -0,0 +1,2 @@ +mmap_bottomup +mmap_default diff --git a/tools/testing/selftests/riscv/mm/Makefile b/tools/testing/selftests/riscv/mm/Makefile new file mode 100644 index 000000000000..11e0f0568923 --- /dev/null +++ b/tools/testing/selftests/riscv/mm/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2021 ARM Limited 
+# Originally tools/testing/arm64/abi/Makefile + +# Additional include paths needed by kselftest.h and local headers +CFLAGS += -D_GNU_SOURCE -std=gnu99 -I. + +TEST_GEN_FILES := testcases/mmap_default testcases/mmap_bottomup + +TEST_PROGS := testcases/run_mmap.sh + +include ../../lib.mk + +$(OUTPUT)/mm: testcases/mmap_default.c testcases/mmap_bottomup.c testcases/mmap_tests.h + $(CC) -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/mm/testcases/mmap_bottomup.c b/tools/testing/selftests/riscv/mm/testcases/mmap_bottomup.c new file mode 100644 index 000000000000..b29379f7e478 --- /dev/null +++ b/tools/testing/selftests/riscv/mm/testcases/mmap_bottomup.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include "../../kselftest_harness.h" + +TEST(infinite_rlimit) +{ +// Only works on 64 bit +#if __riscv_xlen == 64 + struct addresses mmap_addresses; + + EXPECT_EQ(BOTTOM_UP, memory_layout()); + + do_mmaps(&mmap_addresses); + + EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr); + + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr); + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr); + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr); + EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr); +#endif +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/testcases/mmap_default.c b/tools/testing/selftests/riscv/mm/testcases/mmap_default.c new file mode 100644 
index 000000000000..d1accb91b726 --- /dev/null +++ b/tools/testing/selftests/riscv/mm/testcases/mmap_default.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +#include "../../kselftest_harness.h" + +TEST(default_rlimit) +{ +// Only works on 64 bit +#if __riscv_xlen == 64 + struct addresses mmap_addresses; + + EXPECT_EQ(TOP_DOWN, memory_layout()); + + do_mmaps(&mmap_addresses); + + EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr); + EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr); + + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr); + EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr); + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr); + EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr); + EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr); +#endif +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/riscv/mm/testcases/mmap_test.h b/tools/testing/selftests/riscv/mm/testcases/mmap_test.h new file mode 100644 index 000000000000..9b8434f62f57 --- /dev/null +++ b/tools/testing/selftests/riscv/mm/testcases/mmap_test.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _TESTCASES_MMAP_TEST_H +#define _TESTCASES_MMAP_TEST_H +#include +#include +#include + +#define TOP_DOWN 0 +#define BOTTOM_UP 1 + +struct addresses { + int *no_hint; + int *on_37_addr; + int *on_38_addr; + int *on_46_addr; + int *on_47_addr; + int *on_55_addr; + int *on_56_addr; +}; + +static inline void do_mmaps(struct addresses *mmap_addresses) +{ + /* + * Place all of the hint addresses on the 
boundaries of mmap + * sv39, sv48, sv57 + * User addresses end at 1<<38, 1<<47, 1<<56 respectively + */ + void *on_37_bits = (void *)(1UL << 37); + void *on_38_bits = (void *)(1UL << 38); + void *on_46_bits = (void *)(1UL << 46); + void *on_47_bits = (void *)(1UL << 47); + void *on_55_bits = (void *)(1UL << 55); + void *on_56_bits = (void *)(1UL << 56); + + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + + mmap_addresses->no_hint = + mmap(NULL, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_37_addr = + mmap(on_37_bits, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_38_addr = + mmap(on_38_bits, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_46_addr = + mmap(on_46_bits, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_47_addr = + mmap(on_47_bits, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_55_addr = + mmap(on_55_bits, 5 * sizeof(int), prot, flags, 0, 0); + mmap_addresses->on_56_addr = + mmap(on_56_bits, 5 * sizeof(int), prot, flags, 0, 0); +} + +static inline int memory_layout(void) +{ + int prot = PROT_READ | PROT_WRITE; + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + + void *value1 = mmap(NULL, sizeof(int), prot, flags, 0, 0); + void *value2 = mmap(NULL, sizeof(int), prot, flags, 0, 0); + + return value2 > value1; +} +#endif /* _TESTCASES_MMAP_TEST_H */ diff --git a/tools/testing/selftests/riscv/mm/testcases/run_mmap.sh b/tools/testing/selftests/riscv/mm/testcases/run_mmap.sh new file mode 100755 index 000000000000..ca5ad7c48bad --- /dev/null +++ b/tools/testing/selftests/riscv/mm/testcases/run_mmap.sh @@ -0,0 +1,12 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +original_stack_limit=$(ulimit -s) + +./mmap_default + +# Force mmap_bottomup to be ran with bottomup memory due to +# the unlimited stack +ulimit -s unlimited +./mmap_bottomup +ulimit -s $original_stack_limit From 26eee2bfc477c536d65380cd82f458c91d29317a Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 9 Aug 
2023 16:22:03 -0700 Subject: [PATCH 43/51] RISC-V: mm: Update pgtable comment documentation sv57 is supported in the kernel so pgtable.h should reflect that. Signed-off-by: Charlie Jenkins Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230809232218.849726-4-charlie@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index bb0b9ac7b581..2c5f6c8edc8a 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -851,14 +851,16 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) * Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32. * Note that PGDIR_SIZE must evenly divide TASK_SIZE. * Task size is: - * - 0x9fc00000 (~2.5GB) for RV32. - * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu - * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x9fc00000 (~2.5GB) for RV32. + * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu + * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu + * - 0x100000000000000 ( 64PB) for RV64 using SV57 mmu * * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V * Instruction Set Manual Volume II: Privileged Architecture" states that * "load and store effective addresses, which are 64bits, must have bits * 63–48 all equal to bit 47, or else a page-fault exception will occur." + * Similarly for SV57, bits 63–57 must be equal to bit 56. */ #ifdef CONFIG_64BIT #define TASK_SIZE_64 (PGDIR_SIZE * PTRS_PER_PGD / 2) From 7998abe69d3c4cd611d586384fa33f561c1bd61e Mon Sep 17 00:00:00 2001 From: Charlie Jenkins Date: Wed, 9 Aug 2023 16:22:04 -0700 Subject: [PATCH 44/51] RISC-V: mm: Document mmap changes The behavior of mmap is modified with this patch series, so explain the changes to the mmap hint address behavior. 
Signed-off-by: Charlie Jenkins Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230809232218.849726-5-charlie@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/vm-layout.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/Documentation/riscv/vm-layout.rst b/Documentation/riscv/vm-layout.rst index 5462c84f4723..69ff6da1dbf8 100644 --- a/Documentation/riscv/vm-layout.rst +++ b/Documentation/riscv/vm-layout.rst @@ -133,3 +133,25 @@ RISC-V Linux Kernel SV57 ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | modules, BPF ffffffff80000000 | -2 GB | ffffffffffffffff | 2 GB | kernel __________________|____________|__________________|_________|____________________________________________________________ + + +Userspace VAs +-------------------- +To maintain compatibility with software that relies on the VA space with a +maximum of 48 bits the kernel will, by default, return virtual addresses to +userspace from a 48-bit range (sv48). This default behavior is achieved by +passing 0 into the hint address parameter of mmap. On CPUs with an address space +smaller than sv48, the CPU maximum supported address space will be the default. + +Software can "opt-in" to receiving VAs from another VA space by providing +a hint address to mmap. A hint address passed to mmap will cause the largest +address space that fits entirely into the hint to be used, unless there is no +space left in the address space. If there is no space available in the requested +address space, an address in the next smallest available address space will be +returned. + +For example, in order to obtain 48-bit VA space, a hint address greater than +or equal to :code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace +ending at :code:`1 << 47` and the addresses beyond this are reserved for the +kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater +than or equal to :code:`1 << 56` must be provided.
From 6b289a3ffa562070556ca66b766a88b1563d7759 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Tue, 25 Jul 2023 07:38:35 +0200 Subject: [PATCH 45/51] riscv: remove redundant mv instructions Some mv instructions were useful when first introduced to preserve a0 and a1 before function calls. However the code has changed and they are now redundant. Remove them. Signed-off-by: Nam Cao Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230725053835.138910-1-namcaov@gmail.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/head.S | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 11c3b94c4534..3710ea5d160f 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -289,10 +289,6 @@ clear_bss: blt a3, a4, clear_bss clear_bss_done: #endif - /* Save hart ID and DTB physical address */ - mv s0, a0 - mv s1, a1 - la a2, boot_cpu_hartid XIP_FIXUP_OFFSET a2 REG_S a0, (a2) @@ -306,7 +302,7 @@ clear_bss_done: la a0, __dtb_start XIP_FIXUP_OFFSET a0 #else - mv a0, s1 + mv a0, a1 #endif /* CONFIG_BUILTIN_DTB */ call setup_vm #ifdef CONFIG_MMU From 3ed8513cae19c01bac4466d24a497a7fd6e2cf56 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Fri, 28 Jul 2023 00:03:56 +0800 Subject: [PATCH 46/51] riscv: enable DEBUG_FORCE_FUNCTION_ALIGN_64B Allow forcing all function addresses to be 64B aligned, as is possible for other architectures. This may be useful when verifying whether a performance bump is caused by function alignment changes. Before commit 1bf18da62106 ("lib/Kconfig.debug: add ARCH dependency for FUNCTION_ALIGN option"), riscv supported enabling the DEBUG_FORCE_FUNCTION_ALIGN_64B option, but after that commit, each arch needs to claim the support explicitly.
Signed-off-by: Jisheng Zhang Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230727160356.3874-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index fbc89baf7de6..39ffd218e960 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -502,7 +502,7 @@ config SECTION_MISMATCH_WARN_ONLY config DEBUG_FORCE_FUNCTION_ALIGN_64B bool "Force all function address 64B aligned" - depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC || S390) + depends on EXPERT && (X86_64 || ARM64 || PPC32 || PPC64 || ARC || RISCV || S390) select FUNCTION_ALIGNMENT_64B help There are cases that a commit from one domain changes the function From 665c51f644433f4f976ffc13e14523aaceacf9fe Mon Sep 17 00:00:00 2001 From: Ye Xingchen Date: Sat, 6 May 2023 17:11:41 +0800 Subject: [PATCH 47/51] riscv: mm: use bitmap_zero() API bitmap_zero() is faster than bitmap_clear(), so use bitmap_zero() instead of bitmap_clear(). Signed-off-by: Ye Xingchen Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/202305061711417142802@zte.com.cn Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index 12e22e7330e7..217fd4de6134 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -67,7 +67,7 @@ static void __flush_context(void) lockdep_assert_held(&context_lock); /* Update the list of reserved ASIDs and the ASID bitmap. 
*/ - bitmap_clear(context_asid_map, 0, num_asids); + bitmap_zero(context_asid_map, num_asids); /* Mark already active ASIDs as used */ for_each_possible_cpu(i) { From dd7664d67b478afeb79a89e4586c2cd7707d17d6 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 4 Jul 2023 09:43:56 +0200 Subject: [PATCH 48/51] riscv: Mark KASAN tmp* page tables variables as static tmp_pg_dir, tmp_p4d and tmp_pud are only used in kasan_init.c so they should be declared as static. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306282202.bODptiGE-lkp@intel.com/ Fixes: 96f9d4daf745 ("riscv: Rework kasan population functions") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230704074357.233982-1-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 8fc0efcf905c..b88914741f3d 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -23,9 +23,9 @@ */ extern pgd_t early_pg_dir[PTRS_PER_PGD]; -pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss; -p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss; -pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss; +static pgd_t tmp_pg_dir[PTRS_PER_PGD] __page_aligned_bss; +static p4d_t tmp_p4d[PTRS_PER_P4D] __page_aligned_bss; +static pud_t tmp_pud[PTRS_PER_PUD] __page_aligned_bss; static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end) { From 9bdd924803787ceeb10f1ea399e91d75fb05d3a7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 4 Jul 2023 09:43:57 +0200 Subject: [PATCH 49/51] riscv: Move create_tmp_mapping() to init sections This function is only used at boot time so mark it as __init. 
Fixes: 96f9d4daf745 ("riscv: Rework kasan population functions") Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20230704074357.233982-2-alexghiti@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index b88914741f3d..435e94a5b1bb 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -439,7 +439,7 @@ static void __init kasan_shallow_populate(void *start, void *end) kasan_shallow_populate_pgd(vaddr, vend); } -static void create_tmp_mapping(void) +static void __init create_tmp_mapping(void) { void *ptr; p4d_t *base_p4d; From 4e90d0522a688371402ced1d1958ee7381b81f05 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 17 Jul 2023 00:49:25 +0800 Subject: [PATCH 50/51] riscv: support PREEMPT_DYNAMIC with static keys Currently, each architecture can support PREEMPT_DYNAMIC through either static calls or static keys. To support PREEMPT_DYNAMIC on riscv, we face three choices: 1. only add static calls support to riscv As Mark pointed out in commit 99cf983cc8bc ("sched/preempt: Add PREEMPT_DYNAMIC using static keys"), static keys "...should have slightly lower overhead than non-inline static calls, as this effectively inlines each trampoline into the start of its callee. This may avoid redundant work, and may integrate better with CFI schemes." So even if we add static calls (without inline static calls) to riscv, static keys are still a better choice. 2. add static calls and inline static calls to riscv Per my understanding, inline static calls require objtool support, which is not easy. 3. use static keys While riscv doesn't have static calls support, it supports static keys perfectly. So this patch selects HAVE_PREEMPT_DYNAMIC_KEY to enable support for PREEMPT_DYNAMIC on riscv, so that the preemption model can be chosen at boot time.
It also patches asm-generic/preempt.h, mainly to add __preempt_schedule() and __preempt_schedule_notrace() macros for the PREEMPT_DYNAMIC case. Other architectures which use the generic preempt.h can also benefit from this patch by simply selecting HAVE_PREEMPT_DYNAMIC_KEY to enable PREEMPT_DYNAMIC if they support static keys. Signed-off-by: Jisheng Zhang Reviewed-by: Conor Dooley Link: https://lore.kernel.org/r/20230716164925.1858-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + include/asm-generic/preempt.h | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 1494bca1b53a..f3c6c7d3acb5 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -134,6 +134,7 @@ config RISCV select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_PREEMPT_DYNAMIC_KEY if !XIP_KERNEL select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RETHOOK if !XIP_KERNEL select HAVE_RSEQ diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index b4d43a4af5f7..51f8f3881523 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -80,9 +80,21 @@ static __always_inline bool should_resched(int preempt_offset) #ifdef CONFIG_PREEMPTION extern asmlinkage void preempt_schedule(void); -#define __preempt_schedule() preempt_schedule() extern asmlinkage void preempt_schedule_notrace(void); + +#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) + +void dynamic_preempt_schedule(void); +void dynamic_preempt_schedule_notrace(void); +#define __preempt_schedule() dynamic_preempt_schedule() +#define __preempt_schedule_notrace() dynamic_preempt_schedule_notrace() + +#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_KEY*/ + +#define __preempt_schedule() preempt_schedule() #define __preempt_schedule_notrace() preempt_schedule_notrace() + +#endif /* CONFIG_PREEMPT_DYNAMIC &&
CONFIG_HAVE_PREEMPT_DYNAMIC_KEY*/ #endif /* CONFIG_PREEMPTION */ #endif /* __ASM_PREEMPT_H */ From 89775a27ff6d0396b44de0d6f44dcbc25221fdda Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 16 Aug 2023 10:35:43 -0700 Subject: [PATCH 51/51] lib/Kconfig.debug: Restrict DEBUG_INFO_SPLIT for RISC-V When building for ARCH=riscv using LLVM < 14, there is an error with CONFIG_DEBUG_INFO_SPLIT=y: error: A dwo section may not contain relocations This was worked around in LLVM 15 by disallowing '-gsplit-dwarf' with '-mrelax' (the default), so CONFIG_DEBUG_INFO_SPLIT is not selectable with newer versions of LLVM: $ clang --target=riscv64-linux-gnu -gsplit-dwarf -c -o /dev/null -x c /dev/null clang: error: -gsplit-dwarf is unsupported with RISC-V linker relaxation (-mrelax) GCC silently had a similar issue that was resolved with GCC 12.x. Restrict CONFIG_DEBUG_INFO_SPLIT for RISC-V when using LLVM or GCC < 12.x to avoid these known issues. Link: https://github.com/ClangBuiltLinux/linux/issues/1914 Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99090 Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202308090204.9yZffBWo-lkp@intel.com/ Signed-off-by: Nathan Chancellor Reviewed-by: Fangrui Song Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20230816-riscv-debug_info_split-v1-1-d1019d6ccc11@kernel.org Signed-off-by: Palmer Dabbelt --- lib/Kconfig.debug | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 39ffd218e960..7bad297b48fb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -355,6 +355,11 @@ endchoice # "Compressed Debug information" config DEBUG_INFO_SPLIT bool "Produce split debuginfo in .dwo files" depends on $(cc-option,-gsplit-dwarf) + # RISC-V linker relaxation + -gsplit-dwarf has issues with LLVM and GCC + # prior to 12.x: + # https://github.com/llvm/llvm-project/issues/56642 + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99090 + depends on !RISCV || GCC_VERSION >= 
120000 help Generate debug info into separate .dwo files. This significantly reduces the build directory size for builds with DEBUG_INFO,