From 3e42e72796d8991fecad78d61a180e24a4853427 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 1 Apr 2024 00:22:50 +0200 Subject: [PATCH 01/85] powerpc: Use str_plural() in cpu_init_thread_core_maps() Fixes the following Coccinelle/coccicheck warning reported by string_choices.cocci: opportunity for str_plural(tpc) Signed-off-by: Thorsten Blum Signed-off-by: Michael Ellerman Link: https://msgid.link/20240331222249.107467-2-thorsten.blum@toblux.com --- arch/powerpc/kernel/setup-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 01ed1263e1a9..4bd2f87616ba 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -405,7 +405,7 @@ static void __init cpu_init_thread_core_maps(int tpc) cpumask_set_cpu(i, &threads_core_mask); printk(KERN_INFO "CPU maps initialized for %d thread%s per core\n", - tpc, tpc > 1 ? "s" : ""); + tpc, str_plural(tpc)); printk(KERN_DEBUG " (thread shift is %d)\n", threads_shift); } From 01acaf3aa75e1641442cc23d8fe0a7bb4226efb1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 3 Apr 2024 10:06:19 +0200 Subject: [PATCH 02/85] powerpc/fsl-soc: hide unused const variable vmpic_msi_feature is only used conditionally, which triggers a rare -Werror=unused-const-variable= warning with gcc: arch/powerpc/sysdev/fsl_msi.c:567:37: error: 'vmpic_msi_feature' defined but not used [-Werror=unused-const-variable=] 567 | static const struct fsl_msi_feature vmpic_msi_feature = Hide this one in the same #ifdef as the reference so we can turn on the warning by default. 
Fixes: 305bcf26128e ("powerpc/fsl-soc: use CONFIG_EPAPR_PARAVIRT for hcalls") Signed-off-by: Arnd Bergmann Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20240403080702.3509288-2-arnd@kernel.org --- arch/powerpc/sysdev/fsl_msi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 8e6c84df4ca1..e205135ae1fe 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -564,10 +564,12 @@ static const struct fsl_msi_feature ipic_msi_feature = { .msiir_offset = 0x38, }; +#ifdef CONFIG_EPAPR_PARAVIRT static const struct fsl_msi_feature vmpic_msi_feature = { .fsl_pic_ip = FSL_PIC_IP_VMPIC, .msiir_offset = 0, }; +#endif static const struct of_device_id fsl_of_msi_ids[] = { { From 608d4a5ca56302181e669cea0aa571cbec6680eb Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Tue, 26 Mar 2024 15:44:20 +1100 Subject: [PATCH 03/85] powerpc: Error on assembly warnings We currently enable -Werror on the arch/powerpc subtree. However this only catches C warnings. Assembly warnings are logged, but the make invocation will still succeed. This can allow incorrect syntax such as ori r3, r4, r5 to be compiled without catching that the assembler is treating r5 as the immediate value 5. To prevent this in assembly files and inline assembly, add the -fatal-warnings option to assembler invocations. 
Signed-off-by: Benjamin Gray Tested-by: Andrew Donnellan Reviewed-by: Andrew Donnellan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326044420.577031-1-bgray@linux.ibm.com --- arch/powerpc/Kbuild | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild index 22cd0d55a892..da862e9558bc 100644 --- a/arch/powerpc/Kbuild +++ b/arch/powerpc/Kbuild @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -Wa,-fatal-warnings +subdir-asflags-$(CONFIG_PPC_WERROR) := -Wa,-fatal-warnings obj-y += kernel/ obj-y += mm/ From bfe51886ca544956eb4ff924d1937ac01d0ca9c8 Mon Sep 17 00:00:00 2001 From: Geoff Levand Date: Mon, 1 Apr 2024 16:08:31 +0900 Subject: [PATCH 04/85] powerpc: Fix PS3 allmodconfig warning The struct ps3_notification_device in the ps3_probe_thread routine is too large to be on the stack, causing a warning for an allmodconfig build with clang. Change the struct ps3_notification_device from a variable on the stack to a dynamically allocated variable. 
Reported-by: Arnd Bergmann Signed-off-by: Geoff Levand Signed-off-by: Michael Ellerman Link: https://msgid.link/d64f06f4-81ae-4ec5-ab3b-d7f7f091e0ac@infradead.org --- arch/powerpc/platforms/ps3/device-init.c | 61 +++++++++++++----------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c index 878bc160246e..b18e1c92e554 100644 --- a/arch/powerpc/platforms/ps3/device-init.c +++ b/arch/powerpc/platforms/ps3/device-init.c @@ -770,49 +770,51 @@ static struct task_struct *probe_task; static int ps3_probe_thread(void *data) { - struct ps3_notification_device dev; + struct { + struct ps3_notification_device dev; + u8 buf[512]; + } *local; + struct ps3_notify_cmd *notify_cmd; + struct ps3_notify_event *notify_event; int res; unsigned int irq; u64 lpar; - void *buf; - struct ps3_notify_cmd *notify_cmd; - struct ps3_notify_event *notify_event; pr_debug(" -> %s:%u: kthread started\n", __func__, __LINE__); - buf = kzalloc(512, GFP_KERNEL); - if (!buf) + local = kzalloc(sizeof(*local), GFP_KERNEL); + if (!local) return -ENOMEM; - lpar = ps3_mm_phys_to_lpar(__pa(buf)); - notify_cmd = buf; - notify_event = buf; + lpar = ps3_mm_phys_to_lpar(__pa(&local->buf)); + notify_cmd = (struct ps3_notify_cmd *)&local->buf; + notify_event = (struct ps3_notify_event *)&local->buf; /* dummy system bus device */ - dev.sbd.bus_id = (u64)data; - dev.sbd.dev_id = PS3_NOTIFICATION_DEV_ID; - dev.sbd.interrupt_id = PS3_NOTIFICATION_INTERRUPT_ID; + local->dev.sbd.bus_id = (u64)data; + local->dev.sbd.dev_id = PS3_NOTIFICATION_DEV_ID; + local->dev.sbd.interrupt_id = PS3_NOTIFICATION_INTERRUPT_ID; - res = lv1_open_device(dev.sbd.bus_id, dev.sbd.dev_id, 0); + res = lv1_open_device(local->dev.sbd.bus_id, local->dev.sbd.dev_id, 0); if (res) { pr_err("%s:%u: lv1_open_device failed %s\n", __func__, __LINE__, ps3_result(res)); goto fail_free; } - res = ps3_sb_event_receive_port_setup(&dev.sbd, PS3_BINDING_CPU_ANY, - 
&irq); + res = ps3_sb_event_receive_port_setup(&local->dev.sbd, + PS3_BINDING_CPU_ANY, &irq); if (res) { pr_err("%s:%u: ps3_sb_event_receive_port_setup failed %d\n", __func__, __LINE__, res); goto fail_close_device; } - spin_lock_init(&dev.lock); - rcuwait_init(&dev.wait); + spin_lock_init(&local->dev.lock); + rcuwait_init(&local->dev.wait); res = request_irq(irq, ps3_notification_interrupt, 0, - "ps3_notification", &dev); + "ps3_notification", &local->dev); if (res) { pr_err("%s:%u: request_irq failed %d\n", __func__, __LINE__, res); @@ -823,7 +825,7 @@ static int ps3_probe_thread(void *data) notify_cmd->operation_code = 0; /* must be zero */ notify_cmd->event_mask = 1UL << notify_region_probe; - res = ps3_notification_read_write(&dev, lpar, 1); + res = ps3_notification_read_write(&local->dev, lpar, 1); if (res) goto fail_free_irq; @@ -834,36 +836,37 @@ static int ps3_probe_thread(void *data) memset(notify_event, 0, sizeof(*notify_event)); - res = ps3_notification_read_write(&dev, lpar, 0); + res = ps3_notification_read_write(&local->dev, lpar, 0); if (res) break; pr_debug("%s:%u: notify event type 0x%llx bus id %llu dev id %llu" " type %llu port %llu\n", __func__, __LINE__, - notify_event->event_type, notify_event->bus_id, - notify_event->dev_id, notify_event->dev_type, - notify_event->dev_port); + notify_event->event_type, notify_event->bus_id, + notify_event->dev_id, notify_event->dev_type, + notify_event->dev_port); if (notify_event->event_type != notify_region_probe || - notify_event->bus_id != dev.sbd.bus_id) { + notify_event->bus_id != local->dev.sbd.bus_id) { pr_warn("%s:%u: bad notify_event: event %llu, dev_id %llu, dev_type %llu\n", __func__, __LINE__, notify_event->event_type, notify_event->dev_id, notify_event->dev_type); continue; } - ps3_find_and_add_device(dev.sbd.bus_id, notify_event->dev_id); + ps3_find_and_add_device(local->dev.sbd.bus_id, + notify_event->dev_id); } while (!kthread_should_stop()); fail_free_irq: - free_irq(irq, &dev); + 
free_irq(irq, &local->dev); fail_sb_event_receive_port_destroy: - ps3_sb_event_receive_port_destroy(&dev.sbd, irq); + ps3_sb_event_receive_port_destroy(&local->dev.sbd, irq); fail_close_device: - lv1_close_device(dev.sbd.bus_id, dev.sbd.dev_id); + lv1_close_device(local->dev.sbd.bus_id, local->dev.sbd.dev_id); fail_free: - kfree(buf); + kfree(local); probe_task = NULL; From 8884fc918f6aee220f9b41806974508bd0213aca Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Fri, 5 Apr 2024 12:31:22 -0700 Subject: [PATCH 05/85] powerpc: Fix fatal warnings flag for LLVM's integrated assembler When building with LLVM_IAS=1, there is an error because '-fatal-warnings' is not recognized as a valid flag: clang: error: unsupported argument '-fatal-warnings' to option '-Wa,' Use the double hyphen version of the flag, '--fatal-warnings', which works with both the GNU assembler and LLVM's integrated assembler. Fixes: 608d4a5ca563 ("powerpc: Error on assembly warnings") Signed-off-by: Nathan Chancellor Reviewed-by: Justin Stitt Signed-off-by: Michael Ellerman Link: https://msgid.link/20240405-ppc-fix-wa-fatal-warnings-clang-v1-1-bdcd969f2ef0@kernel.org --- arch/powerpc/Kbuild | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/Kbuild b/arch/powerpc/Kbuild index da862e9558bc..571f260b0842 100644 --- a/arch/powerpc/Kbuild +++ b/arch/powerpc/Kbuild @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -Wa,-fatal-warnings -subdir-asflags-$(CONFIG_PPC_WERROR) := -Wa,-fatal-warnings +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror -Wa,--fatal-warnings +subdir-asflags-$(CONFIG_PPC_WERROR) := -Wa,--fatal-warnings obj-y += kernel/ obj-y += mm/ From 676abf7c39267080ab81597c6d4f372a10c0fc21 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 13 Mar 2024 15:56:45 +0200 Subject: [PATCH 06/85] powerpc/52xx: Replace of_gpio.h by proper one of_gpio.h is deprecated and subject to remove. 
The driver doesn't use it directly, replace it with what is really being used. Signed-off-by: Andy Shevchenko Signed-off-by: Michael Ellerman Link: https://msgid.link/20240313135645.2066362-1-andriy.shevchenko@linux.intel.com --- arch/powerpc/platforms/52xx/mpc52xx_common.c | 2 -- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_common.c b/arch/powerpc/platforms/52xx/mpc52xx_common.c index b4938e344f71..253421ffb4e5 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_common.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_common.c @@ -12,12 +12,10 @@ #undef DEBUG -#include #include #include #include #include -#include #include #include #include diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 581059527c36..2bd6abcdc113 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -48,6 +48,7 @@ * the output mode. This driver does not change the output mode setting. */ +#include #include #include #include @@ -56,7 +57,6 @@ #include #include #include -#include #include #include #include From 676b2f99b0f6cd11193eeae13c976565c3fc7545 Mon Sep 17 00:00:00 2001 From: Nicholas Miehlbradt Date: Mon, 8 Apr 2024 05:23:58 +0000 Subject: [PATCH 07/85] powerpc: Add static_key_feature_checks_initialized flag JUMP_LABEL_FEATURE_CHECK_DEBUG used static_key_initialized to determine whether {cpu,mmu}_has_feature() is used before static keys were initialized. However, {cpu,mmu}_has_feature() should not be used before setup_feature_keys() is called but static_key_initialized is set well before this by the call to jump_label_init() in early_init_devtree(). This creates a window in which JUMP_LABEL_FEATURE_CHECK_DEBUG will not detect misuse and report errors. Add a flag specifically to indicate when {cpu,mmu}_has_feature() is safe to use. 
Signed-off-by: Nicholas Miehlbradt Signed-off-by: Michael Ellerman Link: https://msgid.link/20240408052358.5030-1-nicholas@linux.ibm.com --- arch/powerpc/include/asm/cpu_has_feature.h | 2 +- arch/powerpc/include/asm/feature-fixups.h | 2 ++ arch/powerpc/include/asm/mmu.h | 2 +- arch/powerpc/lib/feature-fixups.c | 8 ++++++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/cpu_has_feature.h b/arch/powerpc/include/asm/cpu_has_feature.h index 727d4b321937..0efabccd820c 100644 --- a/arch/powerpc/include/asm/cpu_has_feature.h +++ b/arch/powerpc/include/asm/cpu_has_feature.h @@ -29,7 +29,7 @@ static __always_inline bool cpu_has_feature(unsigned long feature) #endif #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG - if (!static_key_initialized) { + if (!static_key_feature_checks_initialized) { printk("Warning! cpu_has_feature() used prior to jump label init!\n"); dump_stack(); return early_cpu_has_feature(feature); diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 77824bd289a3..17d168dd8b49 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -291,6 +291,8 @@ extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; extern long __start___barrier_nospec_fixup, __stop___barrier_nospec_fixup; extern long __start__btb_flush_fixup, __stop__btb_flush_fixup; +extern bool static_key_feature_checks_initialized; + void apply_feature_fixups(void); void update_mmu_feature_fixups(unsigned long mask); void setup_feature_keys(void); diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 3b72c7ed24cf..24f830cf9bb4 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -251,7 +251,7 @@ static __always_inline bool mmu_has_feature(unsigned long feature) #endif #ifdef CONFIG_JUMP_LABEL_FEATURE_CHECK_DEBUG - if (!static_key_initialized) { + if (!static_key_feature_checks_initialized) { 
printk("Warning! mmu_has_feature() used prior to jump label init!\n"); dump_stack(); return early_mmu_has_feature(feature); diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 4f82581ca203..b7201ba50b2e 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -25,6 +25,13 @@ #include #include +/* + * Used to generate warnings if mmu or cpu feature check functions that + * use static keys before they are initialized. + */ +bool static_key_feature_checks_initialized __read_mostly; +EXPORT_SYMBOL_GPL(static_key_feature_checks_initialized); + struct fixup_entry { unsigned long mask; unsigned long value; @@ -679,6 +686,7 @@ void __init setup_feature_keys(void) jump_label_init(); cpu_feature_keys_init(); mmu_feature_keys_init(); + static_key_feature_checks_initialized = true; } static int __init check_features(void) From 0db880fc865ffb522141ced4bfa66c12ab1fbb70 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Wed, 10 Apr 2024 10:00:06 +0530 Subject: [PATCH 08/85] powerpc: Avoid nmi_enter/nmi_exit in real mode interrupt. nmi_enter()/nmi_exit() touches per cpu variables which can lead to kernel crash when invoked during real mode interrupt handling (e.g. early HMI/MCE interrupt handler) if percpu allocation comes from vmalloc area. Early HMI/MCE handlers are called through DEFINE_INTERRUPT_HANDLER_NMI() wrapper which invokes nmi_enter/nmi_exit calls. We don't see any issue when percpu allocation is from the embedded first chunk. However with CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there are chances where percpu allocation can come from the vmalloc area. 
With kernel command line "percpu_alloc=page" we can force percpu allocation to come from vmalloc area and can see kernel crash in machine_check_early: [ 1.215714] NIP [c000000000e49eb4] rcu_nmi_enter+0x24/0x110 [ 1.215717] LR [c0000000000461a0] machine_check_early+0xf0/0x2c0 [ 1.215719] --- interrupt: 200 [ 1.215720] [c000000fffd73180] [0000000000000000] 0x0 (unreliable) [ 1.215722] [c000000fffd731b0] [0000000000000000] 0x0 [ 1.215724] [c000000fffd73210] [c000000000008364] machine_check_early_common+0x134/0x1f8 Fix this by avoiding use of nmi_enter()/nmi_exit() in real mode if percpu first chunk is not embedded. Reviewed-by: Christophe Leroy Tested-by: Shirisha Ganta Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Link: https://msgid.link/20240410043006.81577-1-mahesh@linux.ibm.com --- arch/powerpc/include/asm/interrupt.h | 10 ++++++++++ arch/powerpc/include/asm/percpu.h | 10 ++++++++++ arch/powerpc/kernel/setup_64.c | 2 ++ 3 files changed, 22 insertions(+) diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h index 7b610864b364..2d6c886b40f4 100644 --- a/arch/powerpc/include/asm/interrupt.h +++ b/arch/powerpc/include/asm/interrupt.h @@ -336,6 +336,14 @@ static inline void interrupt_nmi_enter_prepare(struct pt_regs *regs, struct inte if (IS_ENABLED(CONFIG_KASAN)) return; + /* + * Likewise, do not use it in real mode if percpu first chunk is not + * embedded. With CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK enabled there + * are chances where percpu allocation can come from vmalloc area. 
+ */ + if (percpu_first_chunk_is_paged) + return; + /* Otherwise, it should be safe to call it */ nmi_enter(); } @@ -351,6 +359,8 @@ static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct inter // no nmi_exit for a pseries hash guest taking a real mode exception } else if (IS_ENABLED(CONFIG_KASAN)) { // no nmi_exit for KASAN in real mode + } else if (percpu_first_chunk_is_paged) { + // no nmi_exit if percpu first chunk is not embedded } else { nmi_exit(); } diff --git a/arch/powerpc/include/asm/percpu.h b/arch/powerpc/include/asm/percpu.h index 8e5b7d0b851c..634970ce13c6 100644 --- a/arch/powerpc/include/asm/percpu.h +++ b/arch/powerpc/include/asm/percpu.h @@ -15,6 +15,16 @@ #endif /* CONFIG_SMP */ #endif /* __powerpc64__ */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) && defined(CONFIG_SMP) +#include +DECLARE_STATIC_KEY_FALSE(__percpu_first_chunk_is_paged); + +#define percpu_first_chunk_is_paged \ + (static_key_enabled(&__percpu_first_chunk_is_paged.key)) +#else +#define percpu_first_chunk_is_paged false +#endif /* CONFIG_PPC64 && CONFIG_SMP */ + #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 2f19d5e94485..ae36a129789f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -834,6 +834,7 @@ static __init int pcpu_cpu_to_node(int cpu) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); +DEFINE_STATIC_KEY_FALSE(__percpu_first_chunk_is_paged); void __init setup_per_cpu_areas(void) { @@ -876,6 +877,7 @@ void __init setup_per_cpu_areas(void) if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); + static_key_enable(&__percpu_first_chunk_is_paged.key); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) { __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; From f318c8be797f8572629d5386a88cde7d753457a8 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Wed, 17 
Apr 2024 20:37:40 +0530 Subject: [PATCH 09/85] powerpc/ptdump: Fix walk_vmemmap() to also print first vmemmap entry Currently walk_vmemmap() skips the first vmemmap entry pointed to by vmemmap_list pointer itself. Fix that. With the fix applied the vmemmap entry at 0xc00c000000000000 for hash is displayed: $ cat /sys/kernel/debug/kernel_hash_pagetable ... 0xc00c000000010000: AVPN:cd7bd4e0000 ssize: 1T ... 0xc00c000000000000: AVPN:cd7bd4e0000 ssize: 1T ... Signed-off-by: Ritesh Harjani (IBM) [mpe: Tweak change log wording and add example output] Signed-off-by: Michael Ellerman Link: https://msgid.link/a19ee3dc2b304d39da364a592d5cd167449f8c4a.1713365940.git.ritesh.list@gmail.com --- arch/powerpc/mm/ptdump/hashpagetable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index 9a601587836b..a6baa6166d94 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -491,7 +491,7 @@ static void walk_vmemmap(struct pg_state *st) * Traverse the vmemmaped memory and dump pages that are in the hash * pagetable. */ - while (ptr->list) { + while (ptr) { hpte_find(st, ptr->virt_addr, mmu_vmemmap_psize); ptr = ptr->list; } From 5ca096161cdccfa328acf6704a4615528471d309 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (IBM)" Date: Wed, 3 Apr 2024 14:06:09 +0530 Subject: [PATCH 10/85] powerpc/mm: Align memory_limit value specified using mem= kernel parameter The value specified for the memory limit is used to set a restriction on memory usage. It is important to ensure that this restriction is within the linear map kernel address space range. The hash page table translation uses a 16MB page size to map the kernel linear map address space. htab_bolt_mapping() function aligns down the size of the range while mapping kernel linear address space. 
Since the memblock limit is enforced very early during boot, before we can detect the type of memory translation (radix vs hash), we align the memory limit value specified as a kernel parameter to 16MB. This alignment value will work for both hash and radix translations. Signed-off-by: Aneesh Kumar K.V (IBM) Acked-by: Joel Savitz Signed-off-by: Michael Ellerman Link: https://msgid.link/20240403083611.172833-1-aneesh.kumar@kernel.org --- arch/powerpc/kernel/prom.c | 7 +++++-- arch/powerpc/kernel/prom_init.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index cd8d8883de90..7451bedad1f4 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -846,8 +846,11 @@ void __init early_init_devtree(void *params) reserve_crashkernel(); early_reserve_mem(); - /* Ensure that total memory size is page-aligned. */ - limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); + if (memory_limit > memblock_phys_mem_size()) + memory_limit = 0; + + /* Align down to 16 MB which is large page size with hash page translation */ + limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M); memblock_enforce_memory_limit(limit); #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 0ef358285337..fbb68fc28ed3 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -817,8 +817,8 @@ static void __init early_cmdline_parse(void) opt += 4; prom_memory_limit = prom_memparse(opt, (const char **)&opt); #ifdef CONFIG_PPC64 - /* Align to 16 MB == size of ppc64 large page */ - prom_memory_limit = ALIGN(prom_memory_limit, 0x1000000); + /* Align down to 16 MB which is large page size with hash page translation */ + prom_memory_limit = ALIGN_DOWN(prom_memory_limit, SZ_16M); #endif } From f94f5ac07983cb53de0c964f5428366c19e81993 Mon Sep 17 00:00:00 2001 From: 
"Aneesh Kumar K.V (IBM)" Date: Wed, 3 Apr 2024 14:06:10 +0530 Subject: [PATCH 11/85] powerpc/fadump: Don't update the user-specified memory limit If the user specifies the memory limit, the kernel should honor it such that all allocation and reservations are made within the memory limit specified. fadump was breaking that rule. Remove the code which updates the memory limit such that fadump reservations are done within the limit specified. Signed-off-by: Aneesh Kumar K.V (IBM) Signed-off-by: Michael Ellerman Link: https://msgid.link/20240403083611.172833-2-aneesh.kumar@kernel.org --- arch/powerpc/kernel/fadump.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d14eda1e8589..4e768d93c6d4 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -573,22 +573,6 @@ int __init fadump_reserve_mem(void) } } - /* - * Calculate the memory boundary. - * If memory_limit is less than actual memory boundary then reserve - * the memory for fadump beyond the memory_limit and adjust the - * memory_limit accordingly, so that the running kernel can run with - * specified memory_limit. - */ - if (memory_limit && memory_limit < memblock_end_of_DRAM()) { - size = get_fadump_area_size(); - if ((memory_limit + size) < memblock_end_of_DRAM()) - memory_limit += size; - else - memory_limit = memblock_end_of_DRAM(); - printk(KERN_INFO "Adjusted memory_limit for firmware-assisted" - " dump, now %#016llx\n", memory_limit); - } if (memory_limit) mem_boundary = memory_limit; else From 5a799af9522641517f6d871d9f56e2658ee7db58 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V (IBM)" Date: Wed, 3 Apr 2024 14:06:11 +0530 Subject: [PATCH 12/85] powerpc/mm: Update the memory limit based on direct mapping restrictions memory limit value specified by the user are further updated such that the value is 16MB aligned. This is because hash translation mode use 16MB as direct mapping page size. 
Make sure we update the global variable 'memory_limit' with the 16MB aligned value such that all kernel components will see the new aligned value of the memory limit. Signed-off-by: Aneesh Kumar K.V (IBM) Signed-off-by: Michael Ellerman Link: https://msgid.link/20240403083611.172833-3-aneesh.kumar@kernel.org --- arch/powerpc/kernel/prom.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 7451bedad1f4..b8f764453eaa 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -779,7 +779,6 @@ static inline void save_fscr_to_task(void) {} void __init early_init_devtree(void *params) { - phys_addr_t limit; DBG(" -> early_init_devtree(%px)\n", params); @@ -850,8 +849,8 @@ void __init early_init_devtree(void *params) memory_limit = 0; /* Align down to 16 MB which is large page size with hash page translation */ - limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M); - memblock_enforce_memory_limit(limit); + memory_limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M); + memblock_enforce_memory_limit(memory_limit); #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES) if (!early_radix_enabled()) From 118005713e35a1893c6ee47ab2926cca277737de Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:08 +0530 Subject: [PATCH 13/85] crash: forward memory_notify arg to arch crash hotplug handler In the event of memory hotplug or online/offline events, the crash memory hotplug notifier `crash_memhp_notifier()` receives a `memory_notify` object but doesn't forward that object to the generic and architecture-specific crash hotplug handler. The `memory_notify` object contains the starting PFN (Page Frame Number) and the number of pages in the hot-removed memory. This information is necessary for architectures like PowerPC to update/recreate the kdump image, specifically `elfcorehdr`. 
So update the function signature of `crash_handle_hotplug_event()` and `arch_crash_handle_hotplug_event()` to accept the `memory_notify` object as an argument from crash memory hotplug notifier. Since no such object is available in the case of CPU hotplug event, the crash CPU hotplug notifier `crash_cpuhp_online()` passes NULL to the crash hotplug handler. Signed-off-by: Sourabh Jain Acked-by: Baoquan He Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-2-sourabhjain@linux.ibm.com --- arch/x86/include/asm/kexec.h | 2 +- arch/x86/kernel/crash.c | 4 +++- include/linux/crash_core.h | 2 +- kernel/crash_core.c | 14 +++++++------- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 91ca9a9ee3a2..cb1320ebbc23 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -207,7 +207,7 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image); extern void kdump_nmi_shootdown_cpus(void); #ifdef CONFIG_CRASH_HOTPLUG -void arch_crash_handle_hotplug_event(struct kimage *image); +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index e74d0c4286c1..2a682fe86352 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -432,10 +432,12 @@ unsigned int arch_crash_get_elfcorehdr_size(void) /** * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes * @image: a pointer to kexec_crash_image + * @arg: struct memory_notify handler for memory hotplug case and + * NULL for CPU hotplug case. * * Prepare the new elfcorehdr and replace the existing elfcorehdr. 
*/ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *elfbuf = NULL, *old_elfcorehdr; unsigned long nr_mem_ranges; diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index d33352c2e386..647e928efee8 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -37,7 +37,7 @@ static inline void arch_kexec_unprotect_crashkres(void) { } #ifndef arch_crash_handle_hotplug_event -static inline void arch_crash_handle_hotplug_event(struct kimage *image) { } +static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } #endif int crash_check_update_elfcorehdr(void); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 78b5dc7cee3a..70fa8111a9d6 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -534,7 +534,7 @@ int crash_check_update_elfcorehdr(void) * list of segments it checks (since the elfcorehdr changes and thus * would require an update to purgatory itself to update the digest). 
*/ -static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) +static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, void *arg) { struct kimage *image; @@ -596,7 +596,7 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu) image->hp_action = hp_action; /* Now invoke arch-specific update handler */ - arch_crash_handle_hotplug_event(image); + arch_crash_handle_hotplug_event(image, arg); /* No longer handling a hotplug event */ image->hp_action = KEXEC_CRASH_HP_NONE; @@ -612,17 +612,17 @@ out: crash_hotplug_unlock(); } -static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *v) +static int crash_memhp_notifier(struct notifier_block *nb, unsigned long val, void *arg) { switch (val) { case MEM_ONLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; case MEM_OFFLINE: crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_MEMORY, - KEXEC_CRASH_HP_INVALID_CPU); + KEXEC_CRASH_HP_INVALID_CPU, arg); break; } return NOTIFY_OK; @@ -635,13 +635,13 @@ static struct notifier_block crash_memhp_nb = { static int crash_cpuhp_online(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_ADD_CPU, cpu, NULL); return 0; } static int crash_cpuhp_offline(unsigned int cpu) { - crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu); + crash_handle_hotplug_event(KEXEC_CRASH_HP_REMOVE_CPU, cpu, NULL); return 0; } From 79365026f86948b52c3cb7bf099dded92c559b4c Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:09 +0530 Subject: [PATCH 14/85] crash: add a new kexec flag for hotplug support Commit a72bbec70da2 ("crash: hotplug support for kexec_load()") introduced a new kexec flag, `KEXEC_UPDATE_ELFCOREHDR`. 
Kexec tool uses this flag to indicate to the kernel that it is safe to modify the elfcorehdr of the kdump image loaded using the kexec_load system call. However, it is possible that architectures may need to update kexec segments other than elfcorehdr. For example, FDT (Flattened Device Tree) on PowerPC. Introducing a new kexec flag for every new kexec segment may not be a good solution. Hence, a generic kexec flag bit, `KEXEC_CRASH_HOTPLUG_SUPPORT`, is introduced to share the CPU/Memory hotplug support intent between the kexec tool and the kernel for the kexec_load system call. Now we have two kexec flags that enable crash hotplug support for the kexec_load system call. First is KEXEC_UPDATE_ELFCOREHDR (only used in x86), and second is KEXEC_CRASH_HOTPLUG_SUPPORT (for all architectures). To simplify the process of finding and reporting the crash hotplug support the following changes are introduced. 1. Define arch specific function to process the kexec flags and determine crash hotplug support 2. Rename the @update_elfcorehdr member of struct kimage to @hotplug_support and populate it for both kexec_load and kexec_file_load syscalls, because architecture can update more than one kexec segment 3. 
Let generic function crash_check_hotplug_support report hotplug support for loaded kdump image based on value of @hotplug_support To bring the x86 crash hotplug support in line with the above points, the following changes have been made: - Introduce the arch_crash_hotplug_support function to process kexec flags and determine crash hotplug support - Remove the arch_crash_hotplug_[cpu|memory]_support functions Signed-off-by: Sourabh Jain Acked-by: Baoquan He Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-3-sourabhjain@linux.ibm.com --- arch/x86/include/asm/kexec.h | 11 ++--------- arch/x86/kernel/crash.c | 28 +++++++++++++++++----------- drivers/base/cpu.c | 2 +- drivers/base/memory.c | 2 +- include/linux/crash_core.h | 13 ++++++------- include/linux/kexec.h | 11 +++++++---- include/uapi/linux/kexec.h | 1 + kernel/crash_core.c | 15 ++++++--------- kernel/kexec.c | 4 ++-- kernel/kexec_file.c | 5 +++++ 10 files changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index cb1320ebbc23..ae5482a2f0ca 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -210,15 +210,8 @@ extern void kdump_nmi_shootdown_cpus(void); void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void); -#define crash_hotplug_cpu_support arch_crash_hotplug_cpu_support -#endif - -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void); -#define crash_hotplug_memory_support arch_crash_hotplug_memory_support -#endif +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); +#define arch_crash_hotplug_support arch_crash_hotplug_support unsigned int arch_crash_get_elfcorehdr_size(void); #define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size diff --git 
a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2a682fe86352..f06501445cd9 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -402,20 +402,26 @@ int crash_load_segments(struct kimage *image) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt -/* These functions provide the value for the sysfs crash_hotplug nodes */ -#ifdef CONFIG_HOTPLUG_CPU -int arch_crash_hotplug_cpu_support(void) +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) { - return crash_check_update_elfcorehdr(); -} -#endif -#ifdef CONFIG_MEMORY_HOTPLUG -int arch_crash_hotplug_memory_support(void) -{ - return crash_check_update_elfcorehdr(); -} +#ifdef CONFIG_KEXEC_FILE + if (image->file_mode) + return 1; #endif + /* + * Initially, crash hotplug support for kexec_load was added + * with the KEXEC_UPDATE_ELFCOREHDR flag. Later, this + * functionality was expanded to accommodate multiple kexec + * segment updates, leading to the introduction of the + * KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag bit. Consequently, + * when the kexec tool sends either of these flags, it indicates + * that the required kexec segment (elfcorehdr) is excluded from + * the SHA calculation. 
+ */ + return (kexec_flags & KEXEC_UPDATE_ELFCOREHDR || + kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT); +} unsigned int arch_crash_get_elfcorehdr_size(void) { diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 56fba44ba391..c61ecb0c2ae2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -306,7 +306,7 @@ static ssize_t crash_hotplug_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d\n", crash_hotplug_cpu_support()); + return sysfs_emit(buf, "%d\n", crash_check_hotplug_support()); } static DEVICE_ATTR_ADMIN_RO(crash_hotplug); #endif diff --git a/drivers/base/memory.c b/drivers/base/memory.c index c0436f46cfb7..67858eeb92ed 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -535,7 +535,7 @@ static DEVICE_ATTR_RW(auto_online_blocks); static ssize_t crash_hotplug_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sysfs_emit(buf, "%d\n", crash_hotplug_memory_support()); + return sysfs_emit(buf, "%d\n", crash_check_hotplug_support()); } static DEVICE_ATTR_RO(crash_hotplug); #endif diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index 647e928efee8..44305336314e 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -40,14 +40,13 @@ static inline void arch_kexec_unprotect_crashkres(void) { } static inline void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { } #endif -int crash_check_update_elfcorehdr(void); +int crash_check_hotplug_support(void); -#ifndef crash_hotplug_cpu_support -static inline int crash_hotplug_cpu_support(void) { return 0; } -#endif - -#ifndef crash_hotplug_memory_support -static inline int crash_hotplug_memory_support(void) { return 0; } +#ifndef arch_crash_hotplug_support +static inline int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) +{ + return 0; +} #endif #ifndef crash_get_elfcorehdr_size diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 
060835bb82d5..5b93a5767413 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -319,8 +319,10 @@ struct kimage { /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; #ifdef CONFIG_CRASH_HOTPLUG - /* If set, allow changes to elfcorehdr of kexec_load'd image */ - unsigned int update_elfcorehdr:1; + /* If set, it is safe to update kexec segments that are + * excluded from SHA calculation. + */ + unsigned int hotplug_support:1; #endif #ifdef ARCH_HAS_KIMAGE_ARCH @@ -391,9 +393,10 @@ bool kexec_load_permitted(int kexec_image_type); /* List of defined/legal kexec flags */ #ifndef CONFIG_KEXEC_JUMP -#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR) +#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT) #else -#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR) +#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \ + KEXEC_CRASH_HOTPLUG_SUPPORT) #endif /* List of defined/legal kexec file flags */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index c17bb096ea68..5ae1741ea8ea 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,7 @@ #define KEXEC_ON_CRASH 0x00000001 #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_UPDATE_ELFCOREHDR 0x00000004 +#define KEXEC_CRASH_HOTPLUG_SUPPORT 0x00000008 #define KEXEC_ARCH_MASK 0xffff0000 /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 70fa8111a9d6..394db3ebe835 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -493,10 +493,10 @@ static DEFINE_MUTEX(__crash_hotplug_lock); /* * This routine utilized when the crash_hotplug sysfs node is read. - * It reflects the kernel's ability/permission to update the crash - * elfcorehdr directly. + * It reflects the kernel's ability/permission to update the kdump + * image directly. 
*/ -int crash_check_update_elfcorehdr(void) +int crash_check_hotplug_support(void) { int rc = 0; @@ -508,10 +508,7 @@ int crash_check_update_elfcorehdr(void) return 0; } if (kexec_crash_image) { - if (kexec_crash_image->file_mode) - rc = 1; - else - rc = kexec_crash_image->update_elfcorehdr; + rc = kexec_crash_image->hotplug_support; } /* Release lock now that update complete */ kexec_unlock(); @@ -552,8 +549,8 @@ static void crash_handle_hotplug_event(unsigned int hp_action, unsigned int cpu, image = kexec_crash_image; - /* Check that updating elfcorehdr is permitted */ - if (!(image->file_mode || image->update_elfcorehdr)) + /* Check that kexec segments update is permitted */ + if (!image->hotplug_support) goto out; if (hp_action == KEXEC_CRASH_HP_ADD_CPU || diff --git a/kernel/kexec.c b/kernel/kexec.c index bab542fc1463..a6b3f96bb50c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -135,8 +135,8 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, image->preserve_context = 1; #ifdef CONFIG_CRASH_HOTPLUG - if (flags & KEXEC_UPDATE_ELFCOREHDR) - image->update_elfcorehdr = 1; + if ((flags & KEXEC_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; #endif ret = machine_kexec_prepare(image); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 2d1db05fbf04..3d64290d24c9 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -376,6 +376,11 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, if (ret) goto out; +#ifdef CONFIG_CRASH_HOTPLUG + if ((flags & KEXEC_FILE_ON_CRASH) && arch_crash_hotplug_support(image, flags)) + image->hotplug_support = 1; +#endif + ret = machine_kexec_prepare(image); if (ret) goto out; From f5f0da5a7b18fab383bac92044fd8f4f288c9d38 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:10 +0530 Subject: [PATCH 15/85] powerpc/kexec: move *_memory_ranges functions to ranges.c Move the following functions form kexec/{file_load_64.c => 
ranges.c} and make them public so that components other than KEXEC_FILE can also use these functions. 1. get_exclude_memory_ranges 2. get_reserved_memory_ranges 3. get_crash_memory_ranges 4. get_usable_memory_ranges Later in the series, the get_crash_memory_ranges function is utilized for in-kernel updates to the kdump image during CPU/Memory hotplug or online/offline events for both kexec_load and kexec_file_load syscalls. Since the above functions are moved to ranges.c, some of the helper functions in ranges.c are no longer required to be public. Mark them as static and remove them from the kexec_ranges.h header file. Finally, remove the CONFIG_KEXEC_FILE build dependency for ranges.c because it is required for other configs, such as CONFIG_CRASH_DUMP. No functional changes are intended. Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-4-sourabhjain@linux.ibm.com --- arch/powerpc/include/asm/kexec_ranges.h | 19 +- arch/powerpc/kexec/Makefile | 4 +- arch/powerpc/kexec/file_load_64.c | 190 -------------------- arch/powerpc/kexec/ranges.c | 227 +++++++++++++++++++++++- 4 files changed, 224 insertions(+), 216 deletions(-) diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index f83866a19e87..8489e844b447 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -7,19 +7,8 @@ void sort_memory_ranges(struct crash_mem *mrngs, bool merge); struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); -int add_tce_mem_ranges(struct crash_mem **mem_ranges); -int add_initrd_mem_range(struct crash_mem **mem_ranges); -#ifdef CONFIG_PPC_64S_HASH_MMU -int add_htab_mem_range(struct crash_mem **mem_ranges); -#else -static inline int add_htab_mem_range(struct crash_mem **mem_ranges) -{ - return 0; -} -#endif -int add_kernel_mem_range(struct crash_mem 
**mem_ranges); -int add_rtas_mem_range(struct crash_mem **mem_ranges); -int add_opal_mem_range(struct crash_mem **mem_ranges); -int add_reserved_mem_ranges(struct crash_mem **mem_ranges); - +int get_exclude_memory_ranges(struct crash_mem **mem_ranges); +int get_reserved_memory_ranges(struct crash_mem **mem_ranges); +int get_crash_memory_ranges(struct crash_mem **mem_ranges); +int get_usable_memory_ranges(struct crash_mem **mem_ranges); #endif /* _ASM_POWERPC_KEXEC_RANGES_H */ diff --git a/arch/powerpc/kexec/Makefile b/arch/powerpc/kexec/Makefile index 8e469c4da3f8..470eb0453e17 100644 --- a/arch/powerpc/kexec/Makefile +++ b/arch/powerpc/kexec/Makefile @@ -3,11 +3,11 @@ # Makefile for the linux kernel. # -obj-y += core.o core_$(BITS).o +obj-y += core.o core_$(BITS).o ranges.o obj-$(CONFIG_PPC32) += relocate_32.o -obj-$(CONFIG_KEXEC_FILE) += file_load.o ranges.o file_load_$(BITS).o elf_$(BITS).o +obj-$(CONFIG_KEXEC_FILE) += file_load.o file_load_$(BITS).o elf_$(BITS).o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o obj-$(CONFIG_CRASH_DUMP) += crash.o diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 1bc65de6174f..6a01f62b8fcf 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -47,83 +47,6 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL }; -/** - * get_exclude_memory_ranges - Get exclude memory ranges. This list includes - * regions like opal/rtas, tce-table, initrd, - * kernel, htab which should be avoided while - * setting up kexec load segments. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. 
- */ -static int get_exclude_memory_ranges(struct crash_mem **mem_ranges) -{ - int ret; - - ret = add_tce_mem_ranges(mem_ranges); - if (ret) - goto out; - - ret = add_initrd_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_htab_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_kernel_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_opal_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_reserved_mem_ranges(mem_ranges); - if (ret) - goto out; - - /* exclude memory ranges should be sorted for easy lookup */ - sort_memory_ranges(*mem_ranges, true); -out: - if (ret) - pr_err("Failed to setup exclude memory ranges\n"); - return ret; -} - -/** - * get_reserved_memory_ranges - Get reserve memory ranges. This list includes - * memory regions that should be added to the - * memory reserve map to ensure the region is - * protected from any mischief. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. - */ -static int get_reserved_memory_ranges(struct crash_mem **mem_ranges) -{ - int ret; - - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_tce_mem_ranges(mem_ranges); - if (ret) - goto out; - - ret = add_reserved_mem_ranges(mem_ranges); -out: - if (ret) - pr_err("Failed to setup reserved memory ranges\n"); - return ret; -} - /** * __locate_mem_hole_top_down - Looks top down for a large enough memory hole * in the memory regions between buf_min & buf_max @@ -322,119 +245,6 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf, } #ifdef CONFIG_CRASH_DUMP -/** - * get_usable_memory_ranges - Get usable memory ranges. This list includes - * regions like crashkernel, opal/rtas & tce-table, - * that kdump kernel could use. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. 
- */ -static int get_usable_memory_ranges(struct crash_mem **mem_ranges) -{ - int ret; - - /* - * Early boot failure observed on guests when low memory (first memory - * block?) is not added to usable memory. So, add [0, crashk_res.end] - * instead of [crashk_res.start, crashk_res.end] to workaround it. - * Also, crashed kernel's memory must be added to reserve map to - * avoid kdump kernel from using it. - */ - ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1); - if (ret) - goto out; - - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_opal_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_tce_mem_ranges(mem_ranges); -out: - if (ret) - pr_err("Failed to setup usable memory ranges\n"); - return ret; -} - -/** - * get_crash_memory_ranges - Get crash memory ranges. This list includes - * first/crashing kernel's memory regions that - * would be exported via an elfcore. - * @mem_ranges: Range list to add the memory ranges to. - * - * Returns 0 on success, negative errno on error. 
- */ -static int get_crash_memory_ranges(struct crash_mem **mem_ranges) -{ - phys_addr_t base, end; - struct crash_mem *tmem; - u64 i; - int ret; - - for_each_mem_range(i, &base, &end) { - u64 size = end - base; - - /* Skip backup memory region, which needs a separate entry */ - if (base == BACKUP_SRC_START) { - if (size > BACKUP_SRC_SIZE) { - base = BACKUP_SRC_END + 1; - size -= BACKUP_SRC_SIZE; - } else - continue; - } - - ret = add_mem_range(mem_ranges, base, size); - if (ret) - goto out; - - /* Try merging adjacent ranges before reallocation attempt */ - if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges) - sort_memory_ranges(*mem_ranges, true); - } - - /* Reallocate memory ranges if there is no space to split ranges */ - tmem = *mem_ranges; - if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { - tmem = realloc_mem_ranges(mem_ranges); - if (!tmem) - goto out; - } - - /* Exclude crashkernel region */ - ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); - if (ret) - goto out; - - /* - * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL - * regions are exported to save their context at the time of - * crash, they should actually be backed up just like the - * first 64K bytes of memory. - */ - ret = add_rtas_mem_range(mem_ranges); - if (ret) - goto out; - - ret = add_opal_mem_range(mem_ranges); - if (ret) - goto out; - - /* create a separate program header for the backup region */ - ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE); - if (ret) - goto out; - - sort_memory_ranges(*mem_ranges, false); -out: - if (ret) - pr_err("Failed to setup crash memory ranges\n"); - return ret; -} - /** * check_realloc_usable_mem - Reallocate buffer if it can't accommodate entries * @um_info: Usable memory buffer and ranges info. 
diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index 33b780049aaf..b1f4267d9b17 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -20,9 +20,13 @@ #include #include #include +#include +#include #include #include +#include +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP) /** * get_max_nr_ranges - Get the max no. of ranges crash_mem structure * could hold, given the size allocated for it. @@ -234,13 +238,16 @@ int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size) return __add_mem_range(mem_ranges, base, size); } +#endif /* CONFIG_KEXEC_FILE || CONFIG_CRASH_DUMP */ + +#ifdef CONFIG_KEXEC_FILE /** * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list. * @mem_ranges: Range list to add the memory range(s) to. * * Returns 0 on success, negative errno on error. */ -int add_tce_mem_ranges(struct crash_mem **mem_ranges) +static int add_tce_mem_ranges(struct crash_mem **mem_ranges) { struct device_node *dn = NULL; int ret = 0; @@ -279,7 +286,7 @@ int add_tce_mem_ranges(struct crash_mem **mem_ranges) * * Returns 0 on success, negative errno on error. */ -int add_initrd_mem_range(struct crash_mem **mem_ranges) +static int add_initrd_mem_range(struct crash_mem **mem_ranges) { u64 base, end; int ret; @@ -296,7 +303,6 @@ int add_initrd_mem_range(struct crash_mem **mem_ranges) return ret; } -#ifdef CONFIG_PPC_64S_HASH_MMU /** * add_htab_mem_range - Adds htab range to the given memory ranges list, * if it exists @@ -304,14 +310,18 @@ int add_initrd_mem_range(struct crash_mem **mem_ranges) * * Returns 0 on success, negative errno on error. 
*/ -int add_htab_mem_range(struct crash_mem **mem_ranges) +static int add_htab_mem_range(struct crash_mem **mem_ranges) { + +#ifdef CONFIG_PPC_64S_HASH_MMU if (!htab_address) return 0; return add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes); -} +#else + return 0; #endif +} /** * add_kernel_mem_range - Adds kernel text region to the given @@ -320,18 +330,20 @@ int add_htab_mem_range(struct crash_mem **mem_ranges) * * Returns 0 on success, negative errno on error. */ -int add_kernel_mem_range(struct crash_mem **mem_ranges) +static int add_kernel_mem_range(struct crash_mem **mem_ranges) { return add_mem_range(mem_ranges, 0, __pa(_end)); } +#endif /* CONFIG_KEXEC_FILE */ +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP) /** * add_rtas_mem_range - Adds RTAS region to the given memory ranges list. * @mem_ranges: Range list to add the memory range to. * * Returns 0 on success, negative errno on error. */ -int add_rtas_mem_range(struct crash_mem **mem_ranges) +static int add_rtas_mem_range(struct crash_mem **mem_ranges) { struct device_node *dn; u32 base, size; @@ -356,7 +368,7 @@ int add_rtas_mem_range(struct crash_mem **mem_ranges) * * Returns 0 on success, negative errno on error. */ -int add_opal_mem_range(struct crash_mem **mem_ranges) +static int add_opal_mem_range(struct crash_mem **mem_ranges) { struct device_node *dn; u64 base, size; @@ -374,7 +386,9 @@ int add_opal_mem_range(struct crash_mem **mem_ranges) of_node_put(dn); return ret; } +#endif /* CONFIG_KEXEC_FILE || CONFIG_CRASH_DUMP */ +#ifdef CONFIG_KEXEC_FILE /** * add_reserved_mem_ranges - Adds "/reserved-ranges" regions exported by f/w * to the given memory ranges list. @@ -382,7 +396,7 @@ int add_opal_mem_range(struct crash_mem **mem_ranges) * * Returns 0 on success, negative errno on error. 
*/ -int add_reserved_mem_ranges(struct crash_mem **mem_ranges) +static int add_reserved_mem_ranges(struct crash_mem **mem_ranges) { int n_mem_addr_cells, n_mem_size_cells, i, len, cells, ret = 0; struct device_node *root = of_find_node_by_path("/"); @@ -412,3 +426,198 @@ int add_reserved_mem_ranges(struct crash_mem **mem_ranges) return ret; } + +/** + * get_reserved_memory_ranges - Get reserve memory ranges. This list includes + * memory regions that should be added to the + * memory reserve map to ensure the region is + * protected from any mischief. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. + */ +int get_reserved_memory_ranges(struct crash_mem **mem_ranges) +{ + int ret; + + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_tce_mem_ranges(mem_ranges); + if (ret) + goto out; + + ret = add_reserved_mem_ranges(mem_ranges); +out: + if (ret) + pr_err("Failed to setup reserved memory ranges\n"); + return ret; +} + +/** + * get_exclude_memory_ranges - Get exclude memory ranges. This list includes + * regions like opal/rtas, tce-table, initrd, + * kernel, htab which should be avoided while + * setting up kexec load segments. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. 
+ */ +int get_exclude_memory_ranges(struct crash_mem **mem_ranges) +{ + int ret; + + ret = add_tce_mem_ranges(mem_ranges); + if (ret) + goto out; + + ret = add_initrd_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_htab_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_kernel_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_reserved_mem_ranges(mem_ranges); + if (ret) + goto out; + + /* exclude memory ranges should be sorted for easy lookup */ + sort_memory_ranges(*mem_ranges, true); +out: + if (ret) + pr_err("Failed to setup exclude memory ranges\n"); + return ret; +} + +#ifdef CONFIG_CRASH_DUMP +/** + * get_usable_memory_ranges - Get usable memory ranges. This list includes + * regions like crashkernel, opal/rtas & tce-table, + * that kdump kernel could use. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. + */ +int get_usable_memory_ranges(struct crash_mem **mem_ranges) +{ + int ret; + + /* + * Early boot failure observed on guests when low memory (first memory + * block?) is not added to usable memory. So, add [0, crashk_res.end] + * instead of [crashk_res.start, crashk_res.end] to workaround it. + * Also, crashed kernel's memory must be added to reserve map to + * avoid kdump kernel from using it. + */ + ret = add_mem_range(mem_ranges, 0, crashk_res.end + 1); + if (ret) + goto out; + + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_tce_mem_ranges(mem_ranges); +out: + if (ret) + pr_err("Failed to setup usable memory ranges\n"); + return ret; +} +#endif /* CONFIG_CRASH_DUMP */ +#endif /* CONFIG_KEXEC_FILE */ + +#ifdef CONFIG_CRASH_DUMP +/** + * get_crash_memory_ranges - Get crash memory ranges. 
This list includes + * first/crashing kernel's memory regions that + * would be exported via an elfcore. + * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. + */ +int get_crash_memory_ranges(struct crash_mem **mem_ranges) +{ + phys_addr_t base, end; + struct crash_mem *tmem; + u64 i; + int ret; + + for_each_mem_range(i, &base, &end) { + u64 size = end - base; + + /* Skip backup memory region, which needs a separate entry */ + if (base == BACKUP_SRC_START) { + if (size > BACKUP_SRC_SIZE) { + base = BACKUP_SRC_END + 1; + size -= BACKUP_SRC_SIZE; + } else + continue; + } + + ret = add_mem_range(mem_ranges, base, size); + if (ret) + goto out; + + /* Try merging adjacent ranges before reallocation attempt */ + if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges) + sort_memory_ranges(*mem_ranges, true); + } + + /* Reallocate memory ranges if there is no space to split ranges */ + tmem = *mem_ranges; + if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { + tmem = realloc_mem_ranges(mem_ranges); + if (!tmem) + goto out; + } + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + /* + * FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL + * regions are exported to save their context at the time of + * crash, they should actually be backed up just like the + * first 64K bytes of memory. 
+ */ + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + /* create a separate program header for the backup region */ + ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE); + if (ret) + goto out; + + sort_memory_ranges(*mem_ranges, false); +out: + if (ret) + pr_err("Failed to setup crash memory ranges\n"); + return ret; +} +#endif /* CONFIG_CRASH_DUMP */ From 0857beff9c1ec8bb421a8b7a721da0f34cc886c0 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:11 +0530 Subject: [PATCH 16/85] powerpc/kexec: make the update_cpus_node() function public Move the update_cpus_node() from kexec/{file_load_64.c => core_64.c} to allow other kexec components to use it. Later in the series, this function is used for in-kernel updates to the kdump image during CPU/memory hotplug or online/offline events for both kexec_load and kexec_file_load syscalls. No functional changes are intended. Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-5-sourabhjain@linux.ibm.com --- arch/powerpc/include/asm/kexec.h | 4 ++ arch/powerpc/kexec/core_64.c | 91 +++++++++++++++++++++++++++++++ arch/powerpc/kexec/file_load_64.c | 87 ----------------------------- 3 files changed, 95 insertions(+), 87 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index fdb90e24dc74..d9ff4d0e392d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -185,6 +185,10 @@ static inline void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) #endif /* CONFIG_CRASH_DUMP */ +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP) +int update_cpus_node(void *fdt); +#endif + #ifdef CONFIG_PPC_BOOK3S_64 #include #endif diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 762e4d09aacf..85050be08a23 100644 --- 
a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -30,6 +31,7 @@ #include #include #include +#include int machine_kexec_prepare(struct kimage *image) { @@ -419,3 +421,92 @@ static int __init export_htab_values(void) } late_initcall(export_htab_values); #endif /* CONFIG_PPC_64S_HASH_MMU */ + +#if defined(CONFIG_KEXEC_FILE) || defined(CONFIG_CRASH_DUMP) +/** + * add_node_props - Reads node properties from device node structure and add + * them to fdt. + * @fdt: Flattened device tree of the kernel + * @node_offset: offset of the node to add a property at + * @dn: device node pointer + * + * Returns 0 on success, negative errno on error. + */ +static int add_node_props(void *fdt, int node_offset, const struct device_node *dn) +{ + int ret = 0; + struct property *pp; + + if (!dn) + return -EINVAL; + + for_each_property_of_node(dn, pp) { + ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, pp->length); + if (ret < 0) { + pr_err("Unable to add %s property: %s\n", pp->name, fdt_strerror(ret)); + return ret; + } + } + return ret; +} + +/** + * update_cpus_node - Update cpus node of flattened device tree using of_root + * device node. + * @fdt: Flattened device tree of the kernel. + * + * Returns 0 on success, negative errno on error. 
+ */ +int update_cpus_node(void *fdt) +{ + struct device_node *cpus_node, *dn; + int cpus_offset, cpus_subnode_offset, ret = 0; + + cpus_offset = fdt_path_offset(fdt, "/cpus"); + if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) { + pr_err("Malformed device tree: error reading /cpus node: %s\n", + fdt_strerror(cpus_offset)); + return cpus_offset; + } + + if (cpus_offset > 0) { + ret = fdt_del_node(fdt, cpus_offset); + if (ret < 0) { + pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret)); + return -EINVAL; + } + } + + /* Add cpus node to fdt */ + cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus"); + if (cpus_offset < 0) { + pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset)); + return -EINVAL; + } + + /* Add cpus node properties */ + cpus_node = of_find_node_by_path("/cpus"); + ret = add_node_props(fdt, cpus_offset, cpus_node); + of_node_put(cpus_node); + if (ret < 0) + return ret; + + /* Loop through all subnodes of cpus and add them to fdt */ + for_each_node_by_type(dn, "cpu") { + cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name); + if (cpus_subnode_offset < 0) { + pr_err("Unable to add %s subnode: %s\n", dn->full_name, + fdt_strerror(cpus_subnode_offset)); + ret = cpus_subnode_offset; + goto out; + } + + ret = add_node_props(fdt, cpus_subnode_offset, dn); + if (ret < 0) + goto out; + } +out: + of_node_put(dn); + return ret; +} +#endif /* CONFIG_KEXEC_FILE || CONFIG_CRASH_DUMP */ diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 6a01f62b8fcf..4b94c31e3172 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -838,93 +838,6 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) return extra_size + kdump_extra_fdt_size_ppc64(image); } -/** - * add_node_props - Reads node properties from device node structure and add - * them to fdt. 
- * @fdt: Flattened device tree of the kernel - * @node_offset: offset of the node to add a property at - * @dn: device node pointer - * - * Returns 0 on success, negative errno on error. - */ -static int add_node_props(void *fdt, int node_offset, const struct device_node *dn) -{ - int ret = 0; - struct property *pp; - - if (!dn) - return -EINVAL; - - for_each_property_of_node(dn, pp) { - ret = fdt_setprop(fdt, node_offset, pp->name, pp->value, pp->length); - if (ret < 0) { - pr_err("Unable to add %s property: %s\n", pp->name, fdt_strerror(ret)); - return ret; - } - } - return ret; -} - -/** - * update_cpus_node - Update cpus node of flattened device tree using of_root - * device node. - * @fdt: Flattened device tree of the kernel. - * - * Returns 0 on success, negative errno on error. - */ -static int update_cpus_node(void *fdt) -{ - struct device_node *cpus_node, *dn; - int cpus_offset, cpus_subnode_offset, ret = 0; - - cpus_offset = fdt_path_offset(fdt, "/cpus"); - if (cpus_offset < 0 && cpus_offset != -FDT_ERR_NOTFOUND) { - pr_err("Malformed device tree: error reading /cpus node: %s\n", - fdt_strerror(cpus_offset)); - return cpus_offset; - } - - if (cpus_offset > 0) { - ret = fdt_del_node(fdt, cpus_offset); - if (ret < 0) { - pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret)); - return -EINVAL; - } - } - - /* Add cpus node to fdt */ - cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus"); - if (cpus_offset < 0) { - pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset)); - return -EINVAL; - } - - /* Add cpus node properties */ - cpus_node = of_find_node_by_path("/cpus"); - ret = add_node_props(fdt, cpus_offset, cpus_node); - of_node_put(cpus_node); - if (ret < 0) - return ret; - - /* Loop through all subnodes of cpus and add them to fdt */ - for_each_node_by_type(dn, "cpu") { - cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name); - if (cpus_subnode_offset < 0) { - pr_err("Unable to add %s subnode: %s\n", 
dn->full_name, - fdt_strerror(cpus_subnode_offset)); - ret = cpus_subnode_offset; - goto out; - } - - ret = add_node_props(fdt, cpus_subnode_offset, dn); - if (ret < 0) - goto out; - } -out: - of_node_put(dn); - return ret; -} - static int copy_property(void *fdt, int node_offset, const struct device_node *dn, const char *propname) { From b741092d59761b98781fcb4f3f521312ed8d5006 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:12 +0530 Subject: [PATCH 17/85] powerpc/crash: add crash CPU hotplug support Due to CPU/Memory hotplug or online/offline events, the elfcorehdr (which describes the CPUs and memory of the crashed kernel) and FDT (Flattened Device Tree) of kdump image becomes outdated. Consequently, attempting dump collection with an outdated elfcorehdr or FDT can lead to failed or inaccurate dump collection. Going forward, CPU hotplug or online/offline events are referred as CPU/Memory add/remove events. The current solution to address the above issue involves monitoring the CPU/Memory add/remove events in userspace using udev rules and whenever there are changes in CPU and memory resources, the entire kdump image is loaded again. The kdump image includes kernel, initrd, elfcorehdr, FDT, purgatory. Given that only elfcorehdr and FDT get outdated due to CPU/Memory add/remove events, reloading the entire kdump image is inefficient. More importantly, kdump remains inactive for a substantial amount of time until the kdump reload completes. To address the aforementioned issue, commit 247262756121 ("crash: add generic infrastructure for crash hotplug support") added a generic infrastructure that allows architectures to selectively update the kdump image component during CPU or memory add/remove events within the kernel itself. In the event of a CPU or memory add/remove events, the generic crash hotplug event handler, `crash_handle_hotplug_event()`, is triggered. 
It then acquires the necessary locks to update the kdump image and invokes the architecture-specific crash hotplug handler, `arch_crash_handle_hotplug_event()`, to update the required kdump image components. This patch adds a crash hotplug handler for PowerPC and enables support to update the kdump image on CPU add/remove events. Support for memory add/remove events is added in a subsequent patch with the title "powerpc: add crash memory hotplug support". As mentioned earlier, only the elfcorehdr and FDT kdump image components need to be updated in the event of CPU or memory add/remove events. However, on the PowerPC architecture, the crash hotplug handler only updates the FDT to enable crash hotplug support for CPU add/remove events. Here's why. The elfcorehdr on PowerPC is built with possible CPUs, and thus, it does not need an update on CPU add/remove events. On the other hand, the FDT needs to be updated on CPU add events to include the newly added CPU. If the FDT is not updated and the kernel crashes on a newly added CPU, the kdump kernel will fail to boot due to the unavailability of the crashing CPU in the FDT. During the early boot, it is expected that the boot CPU must be a part of the FDT; otherwise, the kernel will raise a BUG and fail to boot. For more information, refer to commit 36ae37e3436b0 ("powerpc: Make boot_cpuid common between 32 and 64-bit"). Since it is okay to have an offline CPU in the kdump FDT, no action is taken in case of CPU removal. There are two system calls, `kexec_file_load` and `kexec_load`, used to load the kdump image. A few changes have been made to ensure the kernel can safely update the FDT of the kdump image loaded using both system calls. For the kexec_file_load syscall, the kdump image is prepared in the kernel. So to support an increasing number of CPUs, the FDT is constructed with extra buffer space to ensure it can accommodate a possible number of CPU nodes.
Additionally, a call to fdt_pack (which trims the unused space once the FDT is prepared) is avoided if this feature is enabled. For the kexec_load syscall, the FDT is updated only if the KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by userspace (kexec tools). When userspace passes this flag to the kernel, it indicates that the FDT is built to accommodate possible CPUs, and the FDT segment is excluded from SHA calculation, making it safe to update. The changes related to this feature are kept under the CRASH_HOTPLUG config, and it is enabled by default. Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-6-sourabhjain@linux.ibm.com --- arch/powerpc/Kconfig | 4 ++ arch/powerpc/include/asm/kexec.h | 8 +++ arch/powerpc/kexec/crash.c | 103 ++++++++++++++++++++++++++++++ arch/powerpc/kexec/elf_64.c | 3 +- arch/powerpc/kexec/file_load_64.c | 17 +++++ 5 files changed, 134 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1c4be3373686..a1a3b3363008 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -686,6 +686,10 @@ config ARCH_SELECTS_CRASH_DUMP depends on CRASH_DUMP select RELOCATABLE if PPC64 || 44x || PPC_85xx +config ARCH_SUPPORTS_CRASH_HOTPLUG + def_bool y + depends on PPC64 + config FA_DUMP bool "Firmware-assisted dump" depends on CRASH_DUMP && PPC64 && (PPC_RTAS || PPC_POWERNV) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index d9ff4d0e392d..e75970351bcd 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -135,6 +135,14 @@ static inline void crash_setup_regs(struct pt_regs *newregs, ppc_save_regs(newregs); } +#ifdef CONFIG_CRASH_HOTPLUG +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event + +int arch_crash_hotplug_support(struct kimage *image, unsigned long 
kexec_flags); +#define arch_crash_hotplug_support arch_crash_hotplug_support +#endif /* CONFIG_CRASH_HOTPLUG */ + extern int crashing_cpu; extern void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)); extern void crash_ipi_callback(struct pt_regs *regs); diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index ef5c2d25ec39..8938a19af12f 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -392,3 +393,105 @@ void default_machine_crash_shutdown(struct pt_regs *regs) if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(1, 0); } + +#ifdef CONFIG_CRASH_HOTPLUG +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/** + * get_fdt_index - Loop through the kexec segment array and find + * the index of the FDT segment. + * @image: a pointer to kexec_crash_image + * + * Returns the index of FDT segment in the kexec segment array + * if found; otherwise -1. + */ +static int get_fdt_index(struct kimage *image) +{ + void *ptr; + unsigned long mem; + int i, fdt_index = -1; + + /* Find the FDT segment index in kexec segment array. */ + for (i = 0; i < image->nr_segments; i++) { + mem = image->segment[i].mem; + ptr = __va(mem); + + if (ptr && fdt_magic(ptr) == FDT_MAGIC) { + fdt_index = i; + break; + } + } + + return fdt_index; +} + +/** + * update_crash_fdt - updates the cpus node of the crash FDT. 
+ * + * @image: a pointer to kexec_crash_image + */ +static void update_crash_fdt(struct kimage *image) +{ + void *fdt; + int fdt_index; + + fdt_index = get_fdt_index(image); + if (fdt_index < 0) { + pr_err("Unable to locate FDT segment.\n"); + return; + } + + fdt = __va((void *)image->segment[fdt_index].mem); + + /* Temporarily invalidate the crash image while it is replaced */ + xchg(&kexec_crash_image, NULL); + + /* update FDT to reflect changes in CPU resources */ + if (update_cpus_node(fdt)) + pr_err("Failed to update crash FDT"); + + /* The crash image is now valid once again */ + xchg(&kexec_crash_image, image); +} + +int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) +{ +#ifdef CONFIG_KEXEC_FILE + if (image->file_mode) + return 1; +#endif + return kexec_flags & KEXEC_CRASH_HOTPLUG_SUPPORT; +} + +/** + * arch_crash_handle_hotplug_event - Handle crash CPU/Memory hotplug events to update the + * necessary kexec segments based on the hotplug event. + * @image: a pointer to kexec_crash_image + * @arg: struct memory_notify handler for memory hotplug case and NULL for CPU hotplug case. + * + * Update the kdump image based on the type of hotplug event, represented by image->hp_action. + * CPU add: Update the FDT segment to include the newly added CPU. + * CPU remove: No action is needed, with the assumption that it's okay to have offline CPUs + * part of the FDT. + * Memory add/remove: No action is taken as this is not yet supported. 
+ */ +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) +{ + switch (image->hp_action) { + case KEXEC_CRASH_HP_REMOVE_CPU: + return; + + case KEXEC_CRASH_HP_ADD_CPU: + update_crash_fdt(image); + break; + + case KEXEC_CRASH_HP_REMOVE_MEMORY: + case KEXEC_CRASH_HP_ADD_MEMORY: + pr_info_once("Crash update is not supported for memory hotplug\n"); + return; + default: + pr_warn_once("Unknown hotplug action\n"); + } +} +#endif /* CONFIG_CRASH_HOTPLUG */ diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c index 6d8951e8e966..214c071c58ed 100644 --- a/arch/powerpc/kexec/elf_64.c +++ b/arch/powerpc/kexec/elf_64.c @@ -116,7 +116,8 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, if (ret) goto out_free_fdt; - fdt_pack(fdt); + if (!IS_ENABLED(CONFIG_CRASH_HOTPLUG) || image->type != KEXEC_TYPE_CRASH) + fdt_pack(fdt); kbuf.buffer = fdt; kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 4b94c31e3172..2640a804fcdf 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -30,6 +30,7 @@ #include #include #include +#include struct umem_info { __be64 *buf; /* data buffer for usable-memory property */ @@ -789,6 +790,9 @@ static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) unsigned int cpu_nodes, extra_size = 0; struct device_node *dn; u64 usm_entries; +#ifdef CONFIG_CRASH_HOTPLUG + unsigned int possible_cpu_nodes; +#endif if (!IS_ENABLED(CONFIG_CRASH_DUMP) || image->type != KEXEC_TYPE_CRASH) return 0; @@ -816,6 +820,19 @@ static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) if (cpu_nodes > boot_cpu_node_count) extra_size += (cpu_nodes - boot_cpu_node_count) * cpu_node_size(); +#ifdef CONFIG_CRASH_HOTPLUG + /* + * Make sure enough space is reserved to accommodate possible CPU nodes + * in the crash FDT. 
This allows packing possible CPU nodes which are + * not yet present in the system without regenerating the entire FDT. + */ + if (image->type == KEXEC_TYPE_CRASH) { + possible_cpu_nodes = num_possible_cpus() / threads_per_core; + if (possible_cpu_nodes > cpu_nodes) + extra_size += (possible_cpu_nodes - cpu_nodes) * cpu_node_size(); + } +#endif + return extra_size; } From 849599b702ef8977fcd5b2f27c61ef773c42bb88 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 26 Mar 2024 11:24:13 +0530 Subject: [PATCH 18/85] powerpc/crash: add crash memory hotplug support Extend the arch crash hotplug handler, as introduced by the patch title ("powerpc: add crash CPU hotplug support"), to also support memory add/remove events. Elfcorehdr describes the memory of the crash kernel to capture the kernel; hence, it needs to be updated if memory resources change due to memory add/remove events. Therefore, arch_crash_handle_hotplug_event() is updated to recreate the elfcorehdr and replace it with the previous one on memory add/remove events. The memblock list is used to prepare the elfcorehdr. In the case of memory hot remove, the memblock list is updated after the arch crash hotplug handler is triggered, as depicted in Figure 1. Thus, the hot-removed memory is explicitly removed from the crash memory ranges to ensure that the memory ranges added to elfcorehdr do not include the hot-removed memory. Memory remove | v Offline pages | v Initiate memory notify call <----> crash hotplug handler chain for MEM_OFFLINE event | v Update memblock list Figure 1 There are two system calls, `kexec_file_load` and `kexec_load`, used to load the kdump image. A few changes have been made to ensure that the kernel can safely update the elfcorehdr component of the kdump image for both system calls. For the kexec_file_load syscall, kdump image is prepared in the kernel. 
To support an increasing number of memory regions, the elfcorehdr is built with extra buffer space to ensure that it can accommodate additional memory ranges in future. For the kexec_load syscall, the elfcorehdr is updated only if the KEXEC_CRASH_HOTPLUG_SUPPORT kexec flag is passed to the kernel by the kexec tool. Passing this flag to the kernel indicates that the elfcorehdr is built to accommodate additional memory ranges and the elfcorehdr segment is not considered for SHA calculation, making it safe to update. The changes related to this feature are kept under the CRASH_HOTPLUG config, and it is enabled by default. Signed-off-by: Sourabh Jain Acked-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240326055413.186534-7-sourabhjain@linux.ibm.com --- arch/powerpc/include/asm/kexec.h | 3 + arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/crash.c | 95 ++++++++++++++++++++++++- arch/powerpc/kexec/file_load_64.c | 20 +++++- arch/powerpc/kexec/ranges.c | 85 ++++++++++++++++++++++ 5 files changed, 202 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index e75970351bcd..95a98b390d62 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -141,6 +141,9 @@ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags); #define arch_crash_hotplug_support arch_crash_hotplug_support + +unsigned int arch_crash_get_elfcorehdr_size(void); +#define crash_get_elfcorehdr_size arch_crash_get_elfcorehdr_size #endif /* CONFIG_CRASH_HOTPLUG */ extern int crashing_cpu; diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index 8489e844b447..14055896cbcb 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -7,6 +7,7 @@ void sort_memory_ranges(struct crash_mem *mrngs, bool 
merge); struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); +int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int get_exclude_memory_ranges(struct crash_mem **mem_ranges); int get_reserved_memory_ranges(struct crash_mem **mem_ranges); int get_crash_memory_ranges(struct crash_mem **mem_ranges); diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 8938a19af12f..21b193e938a3 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include /* * The primary CPU waits a while for all secondary CPUs to enter. This is to @@ -398,6 +400,94 @@ void default_machine_crash_shutdown(struct pt_regs *regs) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt +/* + * Advertise preferred elfcorehdr size to userspace via + * /sys/kernel/crash_elfcorehdr_size sysfs interface. + */ +unsigned int arch_crash_get_elfcorehdr_size(void) +{ + unsigned long phdr_cnt; + + /* A program header for possible CPUs + vmcoreinfo */ + phdr_cnt = num_possible_cpus() + 1; + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) + phdr_cnt += CONFIG_CRASH_MAX_MEMORY_RANGES; + + return sizeof(struct elfhdr) + (phdr_cnt * sizeof(Elf64_Phdr)); +} + +/** + * update_crash_elfcorehdr() - Recreate the elfcorehdr and replace it with old + * elfcorehdr in the kexec segment array. 
+ * @image: the active struct kimage + * @mn: struct memory_notify data handler + */ +static void update_crash_elfcorehdr(struct kimage *image, struct memory_notify *mn) +{ + int ret; + struct crash_mem *cmem = NULL; + struct kexec_segment *ksegment; + void *ptr, *mem, *elfbuf = NULL; + unsigned long elfsz, memsz, base_addr, size; + + ksegment = &image->segment[image->elfcorehdr_index]; + mem = (void *) ksegment->mem; + memsz = ksegment->memsz; + + ret = get_crash_memory_ranges(&cmem); + if (ret) { + pr_err("Failed to get crash mem range\n"); + return; + } + + /* + * The hot unplugged memory is part of crash memory ranges, + * remove it here. + */ + if (image->hp_action == KEXEC_CRASH_HP_REMOVE_MEMORY) { + base_addr = PFN_PHYS(mn->start_pfn); + size = mn->nr_pages * PAGE_SIZE; + ret = remove_mem_range(&cmem, base_addr, size); + if (ret) { + pr_err("Failed to remove hot-unplugged memory from crash memory ranges\n"); + goto out; + } + } + + ret = crash_prepare_elf64_headers(cmem, false, &elfbuf, &elfsz); + if (ret) { + pr_err("Failed to prepare elf header\n"); + goto out; + } + + /* + * It is unlikely that kernel hit this because elfcorehdr kexec + * segment (memsz) is built with addition space to accommodate growing + * number of crash memory ranges while loading the kdump kernel. It is + * Just to avoid any unforeseen case. + */ + if (elfsz > memsz) { + pr_err("Updated crash elfcorehdr elfsz %lu > memsz %lu", elfsz, memsz); + goto out; + } + + ptr = __va(mem); + if (ptr) { + /* Temporarily invalidate the crash image while it is replaced */ + xchg(&kexec_crash_image, NULL); + + /* Replace the old elfcorehdr with newly prepared elfcorehdr */ + memcpy((void *)ptr, elfbuf, elfsz); + + /* The crash image is now valid once again */ + xchg(&kexec_crash_image, image); + } +out: + kvfree(cmem); + if (elfbuf) + kvfree(elfbuf); +} + /** * get_fdt_index - Loop through the kexec segment array and find * the index of the FDT segment. 
@@ -478,6 +568,8 @@ int arch_crash_hotplug_support(struct kimage *image, unsigned long kexec_flags) */ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { + struct memory_notify *mn; + switch (image->hp_action) { case KEXEC_CRASH_HP_REMOVE_CPU: return; @@ -488,7 +580,8 @@ void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) case KEXEC_CRASH_HP_REMOVE_MEMORY: case KEXEC_CRASH_HP_ADD_MEMORY: - pr_info_once("Crash update is not supported for memory hotplug\n"); + mn = (struct memory_notify *)arg; + update_crash_elfcorehdr(image, mn); return; default: pr_warn_once("Unknown hotplug action\n"); diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 2640a804fcdf..925a69ad2468 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -595,6 +595,23 @@ static void update_backup_region_phdr(struct kimage *image, Elf64_Ehdr *ehdr) } } +static unsigned int kdump_extra_elfcorehdr_size(struct crash_mem *cmem) +{ +#if defined(CONFIG_CRASH_HOTPLUG) && defined(CONFIG_MEMORY_HOTPLUG) + unsigned int extra_sz = 0; + + if (CONFIG_CRASH_MAX_MEMORY_RANGES > (unsigned int)PN_XNUM) + pr_warn("Number of Phdrs %u exceeds max\n", CONFIG_CRASH_MAX_MEMORY_RANGES); + else if (cmem->nr_ranges >= CONFIG_CRASH_MAX_MEMORY_RANGES) + pr_warn("Configured crash mem ranges may not be enough\n"); + else + extra_sz = (CONFIG_CRASH_MAX_MEMORY_RANGES - cmem->nr_ranges) * sizeof(Elf64_Phdr); + + return extra_sz; +#endif + return 0; +} + /** * load_elfcorehdr_segment - Setup crash memory ranges and initialize elfcorehdr * segment needed to load kdump kernel. 
@@ -626,7 +643,8 @@ static int load_elfcorehdr_segment(struct kimage *image, struct kexec_buf *kbuf) kbuf->buffer = headers; kbuf->mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf->bufsz = kbuf->memsz = headers_sz; + kbuf->bufsz = headers_sz; + kbuf->memsz = headers_sz + kdump_extra_elfcorehdr_size(cmem); kbuf->top_down = false; ret = kexec_add_buffer(kbuf); diff --git a/arch/powerpc/kexec/ranges.c b/arch/powerpc/kexec/ranges.c index b1f4267d9b17..3702b0bdab14 100644 --- a/arch/powerpc/kexec/ranges.c +++ b/arch/powerpc/kexec/ranges.c @@ -620,4 +620,89 @@ out: pr_err("Failed to setup crash memory ranges\n"); return ret; } + +/** + * remove_mem_range - Removes the given memory range from the range list. + * @mem_ranges: Range list to remove the memory range to. + * @base: Base address of the range to remove. + * @size: Size of the memory range to remove. + * + * (Re)allocates memory, if needed. + * + * Returns 0 on success, negative errno on error. + */ +int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size) +{ + u64 end; + int ret = 0; + unsigned int i; + u64 mstart, mend; + struct crash_mem *mem_rngs = *mem_ranges; + + if (!size) + return 0; + + /* + * Memory range are stored as start and end address, use + * the same format to do remove operation. + */ + end = base + size - 1; + + for (i = 0; i < mem_rngs->nr_ranges; i++) { + mstart = mem_rngs->ranges[i].start; + mend = mem_rngs->ranges[i].end; + + /* + * Memory range to remove is not part of this range entry + * in the memory range list + */ + if (!(base >= mstart && end <= mend)) + continue; + + /* + * Memory range to remove is equivalent to this entry in the + * memory range list. Remove the range entry from the list. 
+ */ + if (base == mstart && end == mend) { + for (; i < mem_rngs->nr_ranges - 1; i++) { + mem_rngs->ranges[i].start = mem_rngs->ranges[i+1].start; + mem_rngs->ranges[i].end = mem_rngs->ranges[i+1].end; + } + mem_rngs->nr_ranges--; + goto out; + } + /* + * Start address of the memory range to remove and the + * current memory range entry in the list is same. Just + * move the start address of the current memory range + * entry in the list to end + 1. + */ + else if (base == mstart) { + mem_rngs->ranges[i].start = end + 1; + goto out; + } + /* + * End address of the memory range to remove and the + * current memory range entry in the list is same. + * Just move the end address of the current memory + * range entry in the list to base - 1. + */ + else if (end == mend) { + mem_rngs->ranges[i].end = base - 1; + goto out; + } + /* + * Memory range to remove is not at the edge of current + * memory range entry. Split the current memory entry into + * two half. + */ + else { + mem_rngs->ranges[i].end = base - 1; + size = mem_rngs->ranges[i].end - end; + ret = add_mem_range(mem_ranges, end + 1, size); + } + } +out: + return ret; +} #endif /* CONFIG_CRASH_DUMP */ From 9c74ecfd0fc46e2eaf92c1b6169cc0c8a87f1dc2 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 12 Apr 2024 14:50:46 +0530 Subject: [PATCH 19/85] powerpc/pseries: Add pool idle time at LPAR boot When there are no options specified for lparstat, it is expected to give reports since LPAR(Logical Partition) boot. APP(Available Processor Pool) is an indicator of how many cores in the shared pool are free to use in Shared Processor LPAR(SPLPAR). APP is derived using pool_idle_time which is obtained using H_PIC call. The interval based reports show correct APP value while since boot report shows very high APP values. This happens because in that case APP is obtained by dividing pool idle time by LPAR uptime. 
Since pool idle time is reported by the PowerVM hypervisor since its boot, it need not align with LPAR boot. To fix that, export boot pool idle time in lparcfg; powerpc-utils will use this info to derive APP as below for since boot reports. APP = (pool idle time - boot pool idle time) / (uptime * timebase) Results:: Observe APP values. ====================== Shared LPAR ================================ lparstat System Configuration type=Shared mode=Uncapped smt=8 lcpu=12 mem=15573440 kB cpus=37 ent=12.00 reboot stress-ng --cpu=$(nproc) -t 600 sleep 600 So in this case app is expected to be close to 37-6=31. ====== 6.9-rc1 and lparstat 1.3.10 ============= %user %sys %wait %idle physc %entc lbusy app vcsw phint ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 47.48 0.01 0.00 52.51 0.00 0.00 47.49 69099.72 541547 21 === With this patch and powerpc-utils patch to do the above equation === %user %sys %wait %idle physc %entc lbusy app vcsw phint ----- ----- ----- ----- ----- ----- ----- ----- ----- ----- 47.48 0.01 0.00 52.51 5.73 47.75 47.49 31.21 541753 21 ===================================================================== Note: physc, purr/idle purr being inaccurate is being handled in a separate patch in powerpc-utils tree.
Signed-off-by: Shrikanth Hegde Signed-off-by: Michael Ellerman Link: https://msgid.link/20240412092047.455483-2-sshegde@linux.ibm.com --- arch/powerpc/platforms/pseries/lparcfg.c | 39 ++++++++++++++++++------ 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index f73c4d1c26af..679efcf21628 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -170,20 +170,24 @@ out: kfree(buf); } -static unsigned h_pic(unsigned long *pool_idle_time, - unsigned long *num_procs) +static long h_pic(unsigned long *pool_idle_time, + unsigned long *num_procs) { - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE] = {0}; rc = plpar_hcall(H_PIC, retbuf); - *pool_idle_time = retbuf[0]; - *num_procs = retbuf[1]; + if (pool_idle_time) + *pool_idle_time = retbuf[0]; + if (num_procs) + *num_procs = retbuf[1]; return rc; } +unsigned long boot_pool_idle_time; + /* * parse_ppp_data * Parse out the data returned from h_get_ppp and h_pic @@ -215,9 +219,15 @@ static void parse_ppp_data(struct seq_file *m) seq_printf(m, "pool_capacity=%d\n", ppp_data.active_procs_in_pool * 100); - h_pic(&pool_idle_time, &pool_procs); - seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); - seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + /* In case h_pic call is not successful, this would result in + * APP values being wrong in tools like lparstat. 
+ */ + + if (h_pic(&pool_idle_time, &pool_procs) == H_SUCCESS) { + seq_printf(m, "pool_idle_time=%ld\n", pool_idle_time); + seq_printf(m, "pool_num_procs=%ld\n", pool_procs); + seq_printf(m, "boot_pool_idle_time=%ld\n", boot_pool_idle_time); + } } seq_printf(m, "unallocated_capacity_weight=%d\n", @@ -792,6 +802,7 @@ static const struct proc_ops lparcfg_proc_ops = { static int __init lparcfg_init(void) { umode_t mode = 0444; + long retval; /* Allow writing if we have FW_FEATURE_SPLPAR */ if (firmware_has_feature(FW_FEATURE_SPLPAR)) @@ -801,6 +812,16 @@ static int __init lparcfg_init(void) printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); return -EIO; } + + /* If this call fails, it would result in APP values + * being wrong for since boot reports of lparstat + */ + retval = h_pic(&boot_pool_idle_time, NULL); + + if (retval != H_SUCCESS) + pr_debug("H_PIC failed during lparcfg init retval: %ld\n", + retval); + return 0; } machine_device_initcall(pseries, lparcfg_init); From 6d4341638516bf97b9a34947e0bd95035a8230a5 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Fri, 12 Apr 2024 14:50:47 +0530 Subject: [PATCH 20/85] powerpc/pseries: Add failure related checks for h_get_mpp and h_get_ppp Couple of Minor fixes: - hcall return values are long. Fix that for h_get_mpp, h_get_ppp and parse_ppp_data - If hcall fails, values set should be at-least zero. It shouldn't be uninitialized values. 
Fix that for h_get_mpp and h_get_ppp Signed-off-by: Shrikanth Hegde Signed-off-by: Michael Ellerman Link: https://msgid.link/20240412092047.455483-3-sshegde@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 2 +- arch/powerpc/platforms/pseries/lpar.c | 6 +++--- arch/powerpc/platforms/pseries/lparcfg.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index a41e542ba94d..51172625fa3a 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -570,7 +570,7 @@ struct hvcall_mpp_data { unsigned long backing_mem; }; -int h_get_mpp(struct hvcall_mpp_data *); +long h_get_mpp(struct hvcall_mpp_data *mpp_data); struct hvcall_mpp_x_data { unsigned long coalesced_bytes; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 4e9916bb03d7..c1d8bee8f701 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -1886,10 +1886,10 @@ out: * h_get_mpp * H_GET_MPP hcall returns info in 7 parms */ -int h_get_mpp(struct hvcall_mpp_data *mpp_data) +long h_get_mpp(struct hvcall_mpp_data *mpp_data) { - int rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; rc = plpar_hcall9(H_GET_MPP, retbuf); diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 679efcf21628..6e7029640c0c 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -113,8 +113,8 @@ struct hvcall_ppp_data { */ static unsigned int h_get_ppp(struct hvcall_ppp_data *ppp_data) { - unsigned long rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + long rc; rc = plpar_hcall9(H_GET_PPP, retbuf); @@ -197,7 +197,7 @@ static void parse_ppp_data(struct seq_file *m) struct hvcall_ppp_data ppp_data; struct device_node *root; 
const __be32 *perf_level; - int rc; + long rc; rc = h_get_ppp(&ppp_data); if (rc) From c6c5b14dac0d1bd0da8b4d1d3b77f18eb9085fcb Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 23 Apr 2024 01:29:30 +0530 Subject: [PATCH 21/85] powerpc: make fadump resilient with memory add/remove events Due to changes in memory resources caused by either memory hotplug or online/offline events, the elfcorehdr, which describes the CPUs and memory of the crashed kernel to the kernel that collects the dump (known as second/fadump kernel), becomes outdated. Consequently, attempting dump collection with an outdated elfcorehdr can lead to failed or inaccurate dump collection. Memory hotplug or online/offline events are referred to as memory add/remove events in the rest of the commit message. The current solution to address the aforementioned issue is as follows: Monitor memory add/remove events in userspace using udev rules, and re-register fadump whenever there are changes in memory resources. This leads to the creation of a new elfcorehdr with updated system memory information. There are several notable issues associated with re-registering fadump for every memory add/remove event. 1. Bulk memory add/remove events with udev-based fadump re-registration can lead to race conditions and, more importantly, it creates a wide window during which fadump is inactive until all memory add/remove events are settled. 2. Re-registering fadump for every memory add/remove event is inefficient. 3. The memory for elfcorehdr is allocated based on the memblock regions available during early boot and remains fixed thereafter. However, if elfcorehdr is later recreated with additional memblock regions, its size will increase, potentially leading to memory corruption. Address the aforementioned challenges by shifting the creation of elfcorehdr from the first kernel (also referred to as the crashed kernel), where it was created and frequently recreated for every memory add/remove event, to the fadump kernel.
As a result, the elfcorehdr only needs to be created once, thus eliminating the necessity to re-register fadump during memory add/remove events. At present, the first kernel prepares fadump header and stores it in the fadump reserved area. The fadump header includes the start address of the elfcorehdr, crashing CPU details, and other relevant information. In the event of a crash in the first kernel, the second/fadump boots and accesses the fadump header prepared by the first kernel. It then performs the following steps in a platform-specific function [rtas|opal]_fadump_process: 1. Sanity check for fadump header 2. Update CPU notes in elfcorehdr Along with the above, update the setup_fadump()/fadump.c to create elfcorehdr and set its address to the global variable elfcorehdr_addr for the vmcore module to process it in the second/fadump kernel. Section below outlines the information required to create the elfcorehdr and the changes made to make it available to the fadump kernel if it's not already. To create elfcorehdr, the following crashed kernel information is required: CPU notes, vmcoreinfo, and memory ranges. At present, the CPU notes are already prepared in the fadump kernel, so no changes are needed in that regard. The fadump kernel has access to all crashed kernel memory regions, including boot memory regions that are relocated by firmware to fadump reserved areas, so no changes for that either. However, it is necessary to add new members to the fadump header, i.e., the 'fadump_crash_info_header' structure, in order to pass the crashed kernel's vmcoreinfo address and its size to fadump kernel. In addition to the vmcoreinfo address and size, there are a few other attributes also added to the fadump_crash_info_header structure. 1. version: It stores the fadump header version, which is currently set to 1. This provides flexibility to update the fadump crash info header in the future without changing the magic number. 
For each change in the fadump header, the version will be increased. This will help the updated kernel determine how to handle kernel dumps from older kernels. The magic number remains relevant for checking fadump header corruption. 2. pt_regs_sz/cpu_mask_sz: Store the sizes of the pt_regs and cpu_mask structures of the first kernel. These attributes are used to prevent dump processing if the sizes of the pt_regs or cpu_mask structures differ between the first and fadump kernels. Note: if either the first/crashed kernel or the second/fadump kernel does not have the changes introduced here, the kernel fails to collect the dump and prints a relevant error message on the console. Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422195932.1583833-2-sourabhjain@linux.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 31 +- arch/powerpc/kernel/fadump.c | 361 +++++++++++-------- arch/powerpc/platforms/powernv/opal-fadump.c | 22 +- arch/powerpc/platforms/pseries/rtas-fadump.c | 34 +- 4 files changed, 242 insertions(+), 206 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 27f9e11eda28..5d706a7acc8a 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -42,13 +42,38 @@ static inline u64 fadump_str_to_u64(const char *str) #define FADUMP_CPU_UNKNOWN (~((u32)0)) -#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPINF") +/* + * The introduction of new fields in the fadump crash info header has + * led to a change in the magic key from `FADMPINF` to `FADMPSIG` for + * identifying a kernel crash from an old kernel. + * + * To prevent the need for further changes to the magic number in the + * event of future modifications to the fadump crash info header, a + * version field has been introduced to track the fadump crash info + * header version.
+ * + * Consider a few points before adding new members to the fadump crash info + * header structure: + * + * - Append new members; avoid adding them in between. + * - Non-primitive members should have a size member as well. + * - For every change in the fadump header, increment the + * fadump header version. This helps the updated kernel decide how to + * handle kernel dumps from older kernels. + */ +#define FADUMP_CRASH_INFO_MAGIC_OLD fadump_str_to_u64("FADMPINF") +#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPSIG") +#define FADUMP_HEADER_VERSION 1 /* fadump crash info structure */ struct fadump_crash_info_header { u64 magic_number; - u64 elfcorehdr_addr; + u32 version; u32 crashing_cpu; + u64 vmcoreinfo_raddr; + u64 vmcoreinfo_size; + u32 pt_regs_sz; + u32 cpu_mask_sz; struct pt_regs regs; struct cpumask cpu_mask; }; @@ -94,6 +119,8 @@ struct fw_dump { u64 boot_mem_regs_cnt; unsigned long fadumphdr_addr; + u64 elfcorehdr_addr; + u64 elfcorehdr_size; unsigned long cpu_notes_buf_vaddr; unsigned long cpu_notes_buf_size; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 4e768d93c6d4..a020597c065a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -53,8 +53,6 @@ static struct kobject *fadump_kobj; static atomic_t cpus_in_fadump; static DEFINE_MUTEX(fadump_mutex); -static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; - #define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ #define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ sizeof(struct fadump_memory_range)) @@ -373,12 +371,6 @@ static unsigned long __init get_fadump_area_size(void) size = PAGE_ALIGN(size); size += fw_dump.boot_memory_size; size += sizeof(struct fadump_crash_info_header); - size += sizeof(struct elfhdr); /* ELF core header.*/ - size += sizeof(struct elf_phdr); /* place holder for cpu notes */ - /* Program headers for crash memory regions. 
*/ - size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); - - size = PAGE_ALIGN(size); /* This is to hold kernel metadata on platforms that support it */ size += (fw_dump.ops->fadump_get_metadata_size ? @@ -915,36 +907,6 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, return 0; } -static int fadump_exclude_reserved_area(u64 start, u64 end) -{ - u64 ra_start, ra_end; - int ret = 0; - - ra_start = fw_dump.reserve_dump_area_start; - ra_end = ra_start + fw_dump.reserve_dump_area_size; - - if ((ra_start < end) && (ra_end > start)) { - if ((start < ra_start) && (end > ra_end)) { - ret = fadump_add_mem_range(&crash_mrange_info, - start, ra_start); - if (ret) - return ret; - - ret = fadump_add_mem_range(&crash_mrange_info, - ra_end, end); - } else if (start < ra_start) { - ret = fadump_add_mem_range(&crash_mrange_info, - start, ra_start); - } else if (ra_end < end) { - ret = fadump_add_mem_range(&crash_mrange_info, - ra_end, end); - } - } else - ret = fadump_add_mem_range(&crash_mrange_info, start, end); - - return ret; -} - static int fadump_init_elfcore_header(char *bufp) { struct elfhdr *elf; @@ -981,52 +943,6 @@ static int fadump_init_elfcore_header(char *bufp) return 0; } -/* - * Traverse through memblock structure and setup crash memory ranges. These - * ranges will be used create PT_LOAD program headers in elfcore header. - */ -static int fadump_setup_crash_memory_ranges(void) -{ - u64 i, start, end; - int ret; - - pr_debug("Setup crash memory ranges.\n"); - crash_mrange_info.mem_range_cnt = 0; - - /* - * Boot memory region(s) registered with firmware are moved to - * different location at the time of crash. Create separate program - * header(s) for this memory chunk(s) with the correct offset. 
- */ - for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { - start = fw_dump.boot_mem_addr[i]; - end = start + fw_dump.boot_mem_sz[i]; - ret = fadump_add_mem_range(&crash_mrange_info, start, end); - if (ret) - return ret; - } - - for_each_mem_range(i, &start, &end) { - /* - * skip the memory chunk that is already added - * (0 through boot_memory_top). - */ - if (start < fw_dump.boot_mem_top) { - if (end > fw_dump.boot_mem_top) - start = fw_dump.boot_mem_top; - else - continue; - } - - /* add this range excluding the reserved dump area. */ - ret = fadump_exclude_reserved_area(start, end); - if (ret) - return ret; - } - - return 0; -} - /* * If the given physical address falls within the boot memory region then * return the relocated address that points to the dump region reserved @@ -1057,36 +973,50 @@ static inline unsigned long fadump_relocate(unsigned long paddr) return raddr; } -static int fadump_create_elfcore_headers(char *bufp) +static void __init populate_elf_pt_load(struct elf_phdr *phdr, u64 start, + u64 size, unsigned long long offset) { - unsigned long long raddr, offset; - struct elf_phdr *phdr; - struct elfhdr *elf; - int i, j; + phdr->p_align = 0; + phdr->p_memsz = size; + phdr->p_filesz = size; + phdr->p_paddr = start; + phdr->p_offset = offset; + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (unsigned long)__va(start); +} +static void __init fadump_populate_elfcorehdr(struct fadump_crash_info_header *fdh) +{ + char *bufp; + struct elfhdr *elf; + struct elf_phdr *phdr; + u64 boot_mem_dest_offset; + unsigned long long i, ra_start, ra_end, ra_size, mstart, mend; + + bufp = (char *) fw_dump.elfcorehdr_addr; fadump_init_elfcore_header(bufp); elf = (struct elfhdr *)bufp; bufp += sizeof(struct elfhdr); /* - * setup ELF PT_NOTE, place holder for cpu notes info. The notes info - * will be populated during second kernel boot after crash. Hence - * this PT_NOTE will always be the first elf note. 
+ * Set up ELF PT_NOTE, a placeholder for CPU notes information. + * The notes info will be populated later by platform-specific code. + * Hence, this PT_NOTE will always be the first ELF note. * * NOTE: Any new ELF note addition should be placed after this note. */ phdr = (struct elf_phdr *)bufp; bufp += sizeof(struct elf_phdr); phdr->p_type = PT_NOTE; - phdr->p_flags = 0; - phdr->p_vaddr = 0; - phdr->p_align = 0; - - phdr->p_offset = 0; - phdr->p_paddr = 0; - phdr->p_filesz = 0; - phdr->p_memsz = 0; - + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + phdr->p_offset = 0; + phdr->p_paddr = 0; + phdr->p_filesz = 0; + phdr->p_memsz = 0; + /* Increment number of program headers. */ (elf->e_phnum)++; /* setup ELF PT_NOTE for vmcoreinfo */ @@ -1096,55 +1026,66 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_flags = 0; phdr->p_vaddr = 0; phdr->p_align = 0; - - phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); - phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; - + phdr->p_paddr = phdr->p_offset = fdh->vmcoreinfo_raddr; + phdr->p_memsz = phdr->p_filesz = fdh->vmcoreinfo_size; /* Increment number of program headers. */ (elf->e_phnum)++; - /* setup PT_LOAD sections. */ - j = 0; - offset = 0; - raddr = fw_dump.boot_mem_addr[0]; - for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { - u64 mbase, msize; - - mbase = crash_mrange_info.mem_ranges[i].base; - msize = crash_mrange_info.mem_ranges[i].size; - if (!msize) - continue; - + /* + * Setup PT_LOAD sections. first include boot memory regions + * and then add rest of the memory regions. 
+ */ + boot_mem_dest_offset = fw_dump.boot_mem_dest_addr; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { phdr = (struct elf_phdr *)bufp; bufp += sizeof(struct elf_phdr); - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mbase; + populate_elf_pt_load(phdr, fw_dump.boot_mem_addr[i], + fw_dump.boot_mem_sz[i], + boot_mem_dest_offset); + /* Increment number of program headers. */ + (elf->e_phnum)++; + boot_mem_dest_offset += fw_dump.boot_mem_sz[i]; + } - if (mbase == raddr) { - /* - * The entire real memory region will be moved by - * firmware to the specified destination_address. - * Hence set the correct offset. - */ - phdr->p_offset = fw_dump.boot_mem_dest_addr + offset; - if (j < (fw_dump.boot_mem_regs_cnt - 1)) { - offset += fw_dump.boot_mem_sz[j]; - raddr = fw_dump.boot_mem_addr[++j]; - } + /* Memory reserved for fadump in first kernel */ + ra_start = fw_dump.reserve_dump_area_start; + ra_size = get_fadump_area_size(); + ra_end = ra_start + ra_size; + + phdr = (struct elf_phdr *)bufp; + for_each_mem_range(i, &mstart, &mend) { + /* Boot memory regions already added, skip them now */ + if (mstart < fw_dump.boot_mem_top) { + if (mend > fw_dump.boot_mem_top) + mstart = fw_dump.boot_mem_top; + else + continue; } - phdr->p_paddr = mbase; - phdr->p_vaddr = (unsigned long)__va(mbase); - phdr->p_filesz = msize; - phdr->p_memsz = msize; - phdr->p_align = 0; + /* Handle memblock regions overlaps with fadump reserved area */ + if ((ra_start < mend) && (ra_end > mstart)) { + if ((mstart < ra_start) && (mend > ra_end)) { + populate_elf_pt_load(phdr, mstart, ra_start - mstart, mstart); + /* Increment number of program headers. 
*/ + (elf->e_phnum)++; + bufp += sizeof(struct elf_phdr); + phdr = (struct elf_phdr *)bufp; + populate_elf_pt_load(phdr, ra_end, mend - ra_end, ra_end); + } else if (mstart < ra_start) { + populate_elf_pt_load(phdr, mstart, ra_start - mstart, mstart); + } else if (ra_end < mend) { + populate_elf_pt_load(phdr, ra_end, mend - ra_end, ra_end); + } + } else { + /* No overlap with fadump reserved memory region */ + populate_elf_pt_load(phdr, mstart, mend - mstart, mstart); + } /* Increment number of program headers. */ (elf->e_phnum)++; + bufp += sizeof(struct elf_phdr); + phdr = (struct elf_phdr *) bufp; } - return 0; } static unsigned long init_fadump_header(unsigned long addr) @@ -1159,14 +1100,25 @@ static unsigned long init_fadump_header(unsigned long addr) memset(fdh, 0, sizeof(struct fadump_crash_info_header)); fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; - fdh->elfcorehdr_addr = addr; + fdh->version = FADUMP_HEADER_VERSION; /* We will set the crashing cpu id in crash_fadump() during crash. */ fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; + + /* + * The physical address and size of vmcoreinfo are required in the + * second kernel to prepare elfcorehdr. + */ + fdh->vmcoreinfo_raddr = fadump_relocate(paddr_vmcoreinfo_note()); + fdh->vmcoreinfo_size = VMCOREINFO_NOTE_SIZE; + + + fdh->pt_regs_sz = sizeof(struct pt_regs); /* * When LPAR is terminated by PYHP, ensure all possible CPUs' * register data is processed while exporting the vmcore. 
*/ fdh->cpu_mask = *cpu_possible_mask; + fdh->cpu_mask_sz = sizeof(struct cpumask); return addr; } @@ -1174,8 +1126,6 @@ static unsigned long init_fadump_header(unsigned long addr) static int register_fadump(void) { unsigned long addr; - void *vaddr; - int ret; /* * If no memory is reserved then we can not register for firmware- @@ -1184,18 +1134,10 @@ static int register_fadump(void) if (!fw_dump.reserve_dump_area_size) return -ENODEV; - ret = fadump_setup_crash_memory_ranges(); - if (ret) - return ret; - addr = fw_dump.fadumphdr_addr; /* Initialize fadump crash info header. */ addr = init_fadump_header(addr); - vaddr = __va(addr); - - pr_debug("Creating ELF core headers at %#016lx\n", addr); - fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -1214,7 +1156,6 @@ void fadump_cleanup(void) } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. */ fw_dump.ops->fadump_unregister(&fw_dump); - fadump_free_mem_ranges(&crash_mrange_info); } if (fw_dump.ops->fadump_cleanup) @@ -1400,6 +1341,22 @@ static void fadump_release_memory(u64 begin, u64 end) fadump_release_reserved_area(tstart, end); } +static void fadump_free_elfcorehdr_buf(void) +{ + if (fw_dump.elfcorehdr_addr == 0 || fw_dump.elfcorehdr_size == 0) + return; + + /* + * Before freeing the memory of `elfcorehdr`, reset the global + * `elfcorehdr_addr` to prevent modules like `vmcore` from accessing + * invalid memory. 
+ */ + elfcorehdr_addr = ELFCORE_ADDR_ERR; + fadump_free_buffer(fw_dump.elfcorehdr_addr, fw_dump.elfcorehdr_size); + fw_dump.elfcorehdr_addr = 0; + fw_dump.elfcorehdr_size = 0; +} + static void fadump_invalidate_release_mem(void) { mutex_lock(&fadump_mutex); @@ -1411,6 +1368,7 @@ static void fadump_invalidate_release_mem(void) fadump_cleanup(); mutex_unlock(&fadump_mutex); + fadump_free_elfcorehdr_buf(); fadump_release_memory(fw_dump.boot_mem_top, memblock_end_of_DRAM()); fadump_free_cpu_notes_buf(); @@ -1616,6 +1574,102 @@ static void __init fadump_init_files(void) return; } +static int __init fadump_setup_elfcorehdr_buf(void) +{ + int elf_phdr_cnt; + unsigned long elfcorehdr_size; + + /* + * Program header for CPU notes comes first, followed by one for + * vmcoreinfo, and the remaining program headers correspond to + * memory regions. + */ + elf_phdr_cnt = 2 + fw_dump.boot_mem_regs_cnt + memblock_num_regions(memory); + elfcorehdr_size = sizeof(struct elfhdr) + (elf_phdr_cnt * sizeof(struct elf_phdr)); + elfcorehdr_size = PAGE_ALIGN(elfcorehdr_size); + + fw_dump.elfcorehdr_addr = (u64)fadump_alloc_buffer(elfcorehdr_size); + if (!fw_dump.elfcorehdr_addr) { + pr_err("Failed to allocate %lu bytes for elfcorehdr\n", + elfcorehdr_size); + return -ENOMEM; + } + fw_dump.elfcorehdr_size = elfcorehdr_size; + return 0; +} + +/* + * Check if the fadump header of crashed kernel is compatible with fadump kernel. + * + * It checks the magic number, endianness, and size of non-primitive type + * members of fadump header to ensure safe dump collection. 
+ */ +static bool __init is_fadump_header_compatible(struct fadump_crash_info_header *fdh) +{ + if (fdh->magic_number == FADUMP_CRASH_INFO_MAGIC_OLD) { + pr_err("Old magic number, can't process the dump.\n"); + return false; + } + + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + if (fdh->magic_number == swab64(FADUMP_CRASH_INFO_MAGIC)) + pr_err("Endianness mismatch between the crashed and fadump kernels.\n"); + else + pr_err("Fadump header is corrupted.\n"); + + return false; + } + + /* + * Dump collection is not safe if the size of non-primitive type members + * of the fadump header do not match between crashed and fadump kernel. + */ + if (fdh->pt_regs_sz != sizeof(struct pt_regs) || + fdh->cpu_mask_sz != sizeof(struct cpumask)) { + pr_err("Fadump header size mismatch.\n"); + return false; + } + + return true; +} + +static void __init fadump_process(void) +{ + struct fadump_crash_info_header *fdh; + + fdh = (struct fadump_crash_info_header *) __va(fw_dump.fadumphdr_addr); + if (!fdh) { + pr_err("Crash info header is empty.\n"); + goto err_out; + } + + /* Avoid processing the dump if fadump header isn't compatible */ + if (!is_fadump_header_compatible(fdh)) + goto err_out; + + /* Allocate buffer for elfcorehdr */ + if (fadump_setup_elfcorehdr_buf()) + goto err_out; + + fadump_populate_elfcorehdr(fdh); + + /* Let platform update the CPU notes in elfcorehdr */ + if (fw_dump.ops->fadump_process(&fw_dump) < 0) + goto err_out; + + /* + * elfcorehdr is now ready to be exported. + * + * set elfcorehdr_addr so that vmcore module will export the + * elfcorehdr through '/proc/vmcore'. + */ + elfcorehdr_addr = virt_to_phys((void *)fw_dump.elfcorehdr_addr); + return; + +err_out: + fadump_invalidate_release_mem(); +} + /* * Prepare for firmware-assisted dump. */ @@ -1635,12 +1689,7 @@ int __init setup_fadump(void) * saving it to the disk. 
*/ if (fw_dump.dump_active) { - /* - * if dump process fails then invalidate the registration - * and release memory before proceeding for re-registration. - */ - if (fw_dump.ops->fadump_process(&fw_dump) < 0) - fadump_invalidate_release_mem(); + fadump_process(); } /* Initialize the kernel dump memory structure and register with f/w */ else if (fw_dump.reserve_dump_area_size) { diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 964f464b1b0e..767a6b19e42a 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -513,8 +513,8 @@ out: final_note(note_buf); pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); + fadump_conf->elfcorehdr_addr); + fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr); return 0; } @@ -526,12 +526,7 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) return rc; - /* Validate the fadump crash info header */ fdh = __va(fadump_conf->fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - pr_err("Crash info header is not valid.\n"); - return rc; - } #ifdef CONFIG_OPAL_CORE /* @@ -545,18 +540,7 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf) kernel_initiated = true; #endif - rc = opal_fadump_build_cpu_notes(fadump_conf, fdh); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. 
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return rc; + return opal_fadump_build_cpu_notes(fadump_conf, fdh); } static void opal_fadump_region_show(struct fw_dump *fadump_conf, diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index b5853e9fcc3c..214f37788b2d 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -375,11 +375,8 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) } final_note(note_buf); - if (fdh) { - pr_debug("Updating elfcore header (%llx) with cpu notes\n", - fdh->elfcorehdr_addr); - fadump_update_elfcore_header(__va(fdh->elfcorehdr_addr)); - } + pr_debug("Updating elfcore header (%llx) with cpu notes\n", fadump_conf->elfcorehdr_addr); + fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr); return 0; error_out: @@ -389,14 +386,11 @@ error_out: } /* - * Validate and process the dump data stored by firmware before exporting - * it through '/proc/vmcore'. + * Validate and process the dump data stored by the firmware, and update + * the CPU notes of elfcorehdr. */ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) { - struct fadump_crash_info_header *fdh; - int rc = 0; - if (!fdm_active || !fadump_conf->fadumphdr_addr) return -EINVAL; @@ -415,25 +409,7 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) return -EINVAL; } - /* Validate the fadump crash info header */ - fdh = __va(fadump_conf->fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - pr_err("Crash info header is not valid.\n"); - return -EINVAL; - } - - rc = rtas_fadump_build_cpu_notes(fadump_conf); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. 
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return 0; + return rtas_fadump_build_cpu_notes(fadump_conf); } static void rtas_fadump_region_show(struct fw_dump *fadump_conf, From bc446c5acabadeb38b61b565535401c5dfdd1214 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 23 Apr 2024 01:29:31 +0530 Subject: [PATCH 22/85] powerpc/fadump: add hotplug_ready sysfs interface The elfcorehdr describes the CPUs and memory of the crashed kernel to the kernel that captures the dump, known as the second or fadump kernel. The elfcorehdr needs to be updated if the system's memory changes due to memory hotplug or online/offline events. Currently, memory hotplug events are monitored in userspace by udev rules, and fadump is re-registered, which recreates the elfcorehdr with the latest available memory in the system. However, the previous patch ("powerpc: make fadump resilient with memory add/remove events") moved the creation of elfcorehdr to the second or fadump kernel. This eliminates the need to regenerate the elfcorehdr during memory hotplug or online/offline events. Create a sysfs entry at /sys/kernel/fadump/hotplug_ready to let userspace know that fadump re-registration is not required for memory add/remove events. Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422195932.1583833-3-sourabhjain@linux.ibm.com --- Documentation/ABI/testing/sysfs-kernel-fadump | 11 +++++++++++ arch/powerpc/kernel/fadump.c | 14 ++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump b/Documentation/ABI/testing/sysfs-kernel-fadump index 8f7a64a81783..c586054657d6 100644 --- a/Documentation/ABI/testing/sysfs-kernel-fadump +++ b/Documentation/ABI/testing/sysfs-kernel-fadump @@ -38,3 +38,14 @@ Contact: linuxppc-dev@lists.ozlabs.org Description: read only Provide information about the amount of memory reserved by FADump to save the crash dump in bytes. 
+ +What: /sys/kernel/fadump/hotplug_ready +Date: Apr 2024 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read only + Kdump udev rule re-registers fadump on memory add/remove events, + primarily to update the elfcorehdr. This sysfs file indicates to the + kdump udev rule that fadump re-registration is not required on + memory add/remove events because elfcorehdr is now prepared in + the second/fadump kernel. +User: kexec-tools diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a020597c065a..2de7379d0f30 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1426,6 +1426,18 @@ static ssize_t enabled_show(struct kobject *kobj, return sprintf(buf, "%d\n", fw_dump.fadump_enabled); } +/* + * /sys/kernel/fadump/hotplug_ready sysfs node returns 1, which indicates + * to userspace that fadump re-registration is not required on memory + * hotplug events. + */ +static ssize_t hotplug_ready_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", 1); +} + static ssize_t mem_reserved_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -1498,11 +1510,13 @@ static struct kobj_attribute release_attr = __ATTR_WO(release_mem); static struct kobj_attribute enable_attr = __ATTR_RO(enabled); static struct kobj_attribute register_attr = __ATTR_RW(registered); static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved); +static struct kobj_attribute hotplug_ready_attr = __ATTR_RO(hotplug_ready); static struct attribute *fadump_attrs[] = { &enable_attr.attr, &register_attr.attr, &mem_reserved_attr.attr, + &hotplug_ready_attr.attr, NULL, }; From 57e6700145c5d1f49c52137e9163f73ec5441256 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 23 Apr 2024 01:29:32 +0530 Subject: [PATCH 23/85] Documentation/powerpc: update fadump implementation details The patch titled ("powerpc: make fadump resilient with memory add/remove events") has made significant changes to the
implementation of fadump, particularly on elfcorehdr creation and fadump crash info header structure. Therefore, update the fadump implementation documentation to reflect those changes. The following updates are made to the firmware-assisted dump documentation: 1. The elfcorehdr is no longer stored after fadump HDR in the reserved dump area. Instead, the second kernel dynamically allocates memory for the elfcorehdr within the address range from 0 to the boot memory size. Therefore, update figures 1 and 2 of Memory Reservation during the first and second kernels to reflect this change. 2. A version field has been added to the fadump header to manage future changes to the fadump crash info header structure without changing the fadump header magic number. Therefore, remove the corresponding TODO from the document. Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422195932.1583833-4-sourabhjain@linux.ibm.com --- .../arch/powerpc/firmware-assisted-dump.rst | 91 +++++++++---------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/Documentation/arch/powerpc/firmware-assisted-dump.rst b/Documentation/arch/powerpc/firmware-assisted-dump.rst index e363fc48529a..7e37aadd1f77 100644 --- a/Documentation/arch/powerpc/firmware-assisted-dump.rst +++ b/Documentation/arch/powerpc/firmware-assisted-dump.rst @@ -134,12 +134,12 @@ that are run. If there is dump data, then the memory is held. If there is no waiting dump data, then only the memory required to -hold CPU state, HPTE region, boot memory dump, FADump header and -elfcore header, is usually reserved at an offset greater than boot -memory size (see Fig. 1). This area is *not* released: this region -will be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state and -HPTE region, in the case a crash does occur.
+hold CPU state, HPTE region, boot memory dump, and FADump header is +usually reserved at an offset greater than boot memory size (see Fig. 1). +This area is *not* released: this region will be kept permanently +reserved, so that it can act as a receptacle for a copy of the boot +memory content in addition to CPU state and HPTE region, in the case +a crash does occur. Since this reserved memory area is used only after the system crash, there is no point in blocking this significant chunk of memory from @@ -153,22 +153,22 @@ that were present in CMA region:: o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size |<--- Reserved dump area --->| | - | | | Permanent Reservation | | - V V | | V - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | |///|////| DUMP | HDR | ELF |////| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | ^ ^ ^ ^ ^ - | | | | | | - \ CPU HPTE / | | - ------------------------------ | | - Boot memory content gets transferred | | - to reserved area by firmware at the | | - time of crash. | | - FADump Header | - (meta area) | + Low memory Top of memory + 0 boot memory size |<------ Reserved dump area ----->| | + | | | Permanent Reservation | | + V V | | V + +-----------+-----/ /---+---+----+-----------+-------+----+-----+ + | | |///|////| DUMP | HDR |////| | + +-----------+-----/ /---+---+----+-----------+-------+----+-----+ + | ^ ^ ^ ^ ^ + | | | | | | + \ CPU HPTE / | | + -------------------------------- | | + Boot memory content gets transferred | | + to reserved area by firmware at the | | + time of crash. 
| | + FADump Header | + (meta area) | | | Metadata: This area holds a metadata structure whose @@ -186,13 +186,20 @@ that were present in CMA region:: 0 boot memory size | | |<------------ Crash preserved area ------------>| V V |<--- Reserved dump area --->| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | |///|////| DUMP | HDR | ELF |////| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | - V V - Used by second /proc/vmcore - kernel to boot + +----+---+--+-----/ /---+---+----+-------+-----+-----+-------+ + | |ELF| | |///|////| DUMP | HDR |/////| | + +----+---+--+-----/ /---+---+----+-------+-----+-----+-------+ + | | | | | | + ----- ------------------------------ --------------- + \ | | + \ | | + \ | | + \ | ---------------------------- + \ | / + \ | / + \ | / + /proc/vmcore + +---+ |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above @@ -200,6 +207,12 @@ that were present in CMA region:: does not have CPU & HPTE regions while Metadata region is not supported on pSeries currently. + +---+ + |ELF| -> elfcorehdr, it is created in second kernel after crash. + +---+ + + Note: Memory from 0 to the boot memory size is used by second kernel + Fig. 2 @@ -353,26 +366,6 @@ TODO: - Need to come up with the better approach to find out more accurate boot memory size that is required for a kernel to boot successfully when booted with restricted memory. - - The FADump implementation introduces a FADump crash info structure - in the scratch area before the ELF core header. The idea of introducing - this structure is to pass some important crash info data to the second - kernel which will help second kernel to populate ELF core header with - correct data before it gets exported through /proc/vmcore. The current - design implementation does not address a possibility of introducing - additional fields (in future) to this structure without affecting - compatibility. 
Need to come up with the better approach to address this. - - The possible approaches are: - - 1. Introduce version field for version tracking, bump up the version - whenever a new field is added to the structure in future. The version - field can be used to find out what fields are valid for the current - version of the structure. - 2. Reserve the area of predefined size (say PAGE_SIZE) for this - structure and have unused area as reserved (initialized to zero) - for future field additions. - - The advantage of approach 1 over 2 is we don't need to reserve extra space. Author: Mahesh Salgaonkar From d1679b4fa1722e6bb4a17b13aacdc01a130ba362 Mon Sep 17 00:00:00 2001 From: Ganesh Goudar Date: Mon, 22 Apr 2024 13:27:37 +0530 Subject: [PATCH 24/85] powerpc/eeh: Permanently disable the removed device When a device is hot removed on powernv, the hotplug driver clears the device's state. However, on pseries, if a device is removed by phyp after reaching the error threshold, the kernel remains unaware, leading to the device not being torn down. This prevents necessary remediation actions like failover. Permanently disable the device if the presence check fails. Also, in eeh_dev_check_failure() we may consider the error as a false positive if the device is hot-unplugged, as the get_state call returns EEH_STATE_NOT_SUPPORT and we may end up not clearing the device state, so log the event if the state is not moved to permanent failure state. 
Signed-off-by: Ganesh Goudar Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422075737.1405551-1-ganeshgr@linux.ibm.com --- arch/powerpc/kernel/eeh.c | 11 ++++++++++- arch/powerpc/kernel/eeh_driver.c | 13 +++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index ab316e155ea9..6670063a7a6c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -506,9 +506,18 @@ int eeh_dev_check_failure(struct eeh_dev *edev) * We will punt with the following conditions: Failure to get * PE's state, EEH not support and Permanently unavailable * state, PE is in good state. + * + * On the pSeries, after reaching the threshold, get_state might + * return EEH_STATE_NOT_SUPPORT. However, it's possible that the + * device state remains uncleared if the device is not marked + * pci_channel_io_perm_failure. Therefore, consider logging the + * event to let device removal happen. + * */ if ((ret < 0) || - (ret == EEH_STATE_NOT_SUPPORT) || eeh_state_active(ret)) { + (ret == EEH_STATE_NOT_SUPPORT && + dev->error_state == pci_channel_io_perm_failure) || + eeh_state_active(ret)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 48773d2d9be3..7efe04c68f0f 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -865,9 +865,18 @@ void eeh_handle_normal_event(struct eeh_pe *pe) devices++; if (!devices) { - pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n", + pr_warn("EEH: Frozen PHB#%x-PE#%x is empty!\n", pe->phb->global_number, pe->addr); - goto out; /* nothing to recover */ + /* + * The device is removed, tear down its state, on powernv + * hotplug driver would take care of it but not on pseries, + * permanently disable the card as it is hot removed. 
+ * + * In the case of powernv, note that the removal of device + * is covered by pci rescan lock, so no problem even if hotplug + * driver attempts to remove the device. + */ + goto recover_failed; } /* Log the event */ From 4ccae23609f589dd69a593f457f76ee8b0e2d4e0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 19 Apr 2024 21:59:13 +1000 Subject: [PATCH 25/85] powerpc/dart: Drop unnecessary call to kmemleak_no_scan() Erhard reported that kmemleak was showing a warning at boot: kmemleak: Not scanning unknown object at 0xc00000007f000000 CPU: 0 PID: 0 Comm: swapper Not tainted 5.19.0-rc3-PMacG5+ #2 Call Trace: .dump_stack_lvl+0x7c/0xc4 (unreliable) .kmemleak_no_scan+0xe0/0x100 .iommu_init_early_dart+0x2f0/0x924 .pmac_probe+0x1b0/0x20c .setup_arch+0x1b8/0x674 .start_kernel+0xdc/0xb74 start_here_common+0x1c/0x44 DART table allocated at: (____ptrval____) Which he bisected to a change in kmemleak, commit 23c2d497de21 ("mm: kmemleak: take a full lowmem check in kmemleak_*_phys()"). Because pmac_probe() is called before mem_topology_setup(), the min/ max PFN variables are still zero. That causes kmemleak_alloc_phys() to ignore the allocation, because the checks against the PFN fail. Then kmemleak_no_scan() can't find the allocation and prints warning. Given that kmemleak_alloc_phys() is ignoring the allocation to begin with, there's no need to call kmemleak_no_scan() at all, which avoids the warning. 
Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/bug-216156-206035@https.bugzilla.kernel.org%2F/ Signed-off-by: Michael Ellerman Link: https://msgid.link/20240419115913.3317575-1-mpe@ellerman.id.au --- arch/powerpc/sysdev/dart_iommu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index 98096bbfd62e..c0d10c149661 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -243,9 +242,6 @@ static void __init allocate_dart(void) if (!dart_tablebase) panic("Failed to allocate 16MB below 2GB for DART table\n"); - /* There is no point scanning the DART space for leaks*/ - kmemleak_no_scan((void *)dart_tablebase); - /* Allocate a spare page to map all invalid DART pages. We need to do * that to work around what looks like a problem with the HT bridge * prefetching into invalid pages and corrupting data From ff2e185cf73df480ec69675936c4ee75a445c3e4 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 8 Apr 2024 09:08:31 -0500 Subject: [PATCH 26/85] powerpc/pseries: Enforce hcall result buffer validity and size plpar_hcall(), plpar_hcall9(), and related functions expect callers to provide valid result buffers of certain minimum size. Currently this is communicated only through comments in the code and the compiler has no idea. For example, if I write a bug like this: long retbuf[PLPAR_HCALL_BUFSIZE]; // should be PLPAR_HCALL9_BUFSIZE plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, ...); This compiles with no diagnostics emitted, but likely results in stack corruption at runtime when plpar_hcall9() stores results past the end of the array. (To be clear this is a contrived example and I have not found a real instance yet.) To make this class of error less likely, we can use explicitly-sized array parameters instead of pointers in the declarations for the hcall APIs. 
When compiled with -Warray-bounds[1], the code above now provokes a diagnostic like this: error: array argument is too small; is of size 32, callee requires at least 72 [-Werror,-Warray-bounds] 60 | plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, | ^ ~~~~~~ [1] Enabled for LLVM builds but not GCC for now. See commit 0da6e5fd6c37 ("gcc: disable '-Warray-bounds' for gcc-13 too") and related changes. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://msgid.link/20240408-pseries-hvcall-retbuf-v1-1-ebc73d7253cf@linux.ibm.com --- arch/powerpc/include/asm/hvcall.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 51172625fa3a..7a8495660c2f 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -524,7 +524,7 @@ long plpar_hcall_norets_notrace(unsigned long opcode, ...); * Used for all but the craziest of phyp interfaces (see plpar_hcall9) */ #define PLPAR_HCALL_BUFSIZE 4 -long plpar_hcall(unsigned long opcode, unsigned long *retbuf, ...); +long plpar_hcall(unsigned long opcode, unsigned long retbuf[static PLPAR_HCALL_BUFSIZE], ...); /** * plpar_hcall_raw: - Make a hypervisor call without calculating hcall stats @@ -538,7 +538,7 @@ long plpar_hcall(unsigned long opcode, unsigned long *retbuf, ...); * plpar_hcall, but plpar_hcall_raw works in real mode and does not * calculate hypervisor call statistics. */ -long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...); +long plpar_hcall_raw(unsigned long opcode, unsigned long retbuf[static PLPAR_HCALL_BUFSIZE], ...); /** * plpar_hcall9: - Make a pseries hypervisor call with up to 9 return arguments @@ -549,8 +549,8 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...); * PLPAR_HCALL9_BUFSIZE to size the return argument buffer. 
*/ #define PLPAR_HCALL9_BUFSIZE 9 -long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...); -long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...); +long plpar_hcall9(unsigned long opcode, unsigned long retbuf[static PLPAR_HCALL9_BUFSIZE], ...); +long plpar_hcall9_raw(unsigned long opcode, unsigned long retbuf[static PLPAR_HCALL9_BUFSIZE], ...); /* pseries hcall tracing */ extern struct static_key hcall_tracepoint_key; From 29247de4ad753771afef95ace8af738d807ca279 Mon Sep 17 00:00:00 2001 From: Lidong Zhong Date: Thu, 11 Apr 2024 10:04:50 +0800 Subject: [PATCH 27/85] powerpc/pseries/vio: Don't return ENODEV if node or compatible missing We noticed the following nuisance messages during boot process: vio vio: uevent: failed to send synthetic uevent vio 4000: uevent: failed to send synthetic uevent vio 4001: uevent: failed to send synthetic uevent vio 4002: uevent: failed to send synthetic uevent vio 4004: uevent: failed to send synthetic uevent It's caused by either vio_register_device_node() failing to set dev->of_node or the node is missing a "compatible" property. To match the definition of modalias in modalias_show(), remove the return of ENODEV in such cases. The failure messages are also suppressed with this change. 
Signed-off-by: Lidong Zhong Signed-off-by: Michael Ellerman Link: https://msgid.link/20240411020450.12725-1-lidong.zhong@suse.com --- arch/powerpc/platforms/pseries/vio.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 90ff85c879bf..b2babfdbc40b 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,13 +1592,9 @@ static int vio_hotplug(const struct device *dev, struct kobj_uevent_env *env) const char *cp; dn = dev->of_node; - if (!dn) - return -ENODEV; - cp = of_get_property(dn, "compatible", NULL); - if (!cp) - return -ENODEV; + if (dn && (cp = of_get_property(dn, "compatible", NULL))) + add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); - add_uevent_var(env, "MODALIAS=vio:T%sS%s", vio_dev->type, cp); return 0; } From 37496845c812db2a470d51088a59ee38156e8058 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:09 +0530 Subject: [PATCH 28/85] selftests/powerpc: Re-order *FLAGS to follow lib.mk In some powerpc/ sub-folder Makefiles, CFLAGS are defined before lib.mk include. Clean it up by re-ordering the flags to follow after the mk include. This is needed to support sub-folders in powerpc/ buildable on its own. 
Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-1-maddy@linux.ibm.com --- .../selftests/powerpc/benchmarks/Makefile | 4 ++-- .../selftests/powerpc/copyloops/Makefile | 20 +++++++++---------- .../selftests/powerpc/nx-gzip/Makefile | 4 ++-- .../selftests/powerpc/pmu/ebb/Makefile | 20 +++++++++---------- .../powerpc/pmu/event_code_tests/Makefile | 4 ++-- .../powerpc/pmu/sampling_tests/Makefile | 4 ++-- .../selftests/powerpc/primitives/Makefile | 4 ++-- .../selftests/powerpc/security/Makefile | 4 ++-- .../testing/selftests/powerpc/signal/Makefile | 3 ++- .../selftests/powerpc/stringloops/Makefile | 10 +++++----- .../selftests/powerpc/switch_endian/Makefile | 4 ++-- .../selftests/powerpc/syscalls/Makefile | 4 ++-- tools/testing/selftests/powerpc/vphn/Makefile | 4 ++-- 13 files changed, 45 insertions(+), 44 deletions(-) diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index a32a6ab89914..75f5232c3aec 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,11 +4,11 @@ TEST_GEN_FILES := exec_target TEST_FILES := settings -CFLAGS += -O2 - top_srcdir = ../../../../.. 
include ../../lib.mk +CFLAGS += -O2 + $(TEST_GEN_PROGS): ../harness.c $(OUTPUT)/context_switch: ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 77594e697f2f..72684ed589c0 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -1,14 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -m64 -CFLAGS += -I$(CURDIR) -CFLAGS += -D SELFTEST -CFLAGS += -maltivec -CFLAGS += -mcpu=power4 - -# Use our CFLAGS for the implicit .S rule & set the asm machine type -ASFLAGS = $(CFLAGS) -Wa,-mpower4 - TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ copyuser_p7_t0 copyuser_p7_t1 \ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \ @@ -21,6 +11,16 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +# The loops are all 64-bit code +CFLAGS += -m64 +CFLAGS += -I$(CURDIR) +CFLAGS += -D SELFTEST +CFLAGS += -maltivec +CFLAGS += -mcpu=power4 + +# Use our CFLAGS for the implicit .S rule & set the asm machine type +ASFLAGS = $(CFLAGS) -Wa,-mpower4 + $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) $(CC) $(CPPFLAGS) $(CFLAGS) \ -D COPY_LOOP=test___copy_tofrom_user_base \ diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index 0785c2e99d40..b40991f902b2 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -1,8 +1,8 @@ -CFLAGS = -O3 -m64 -I./include -I../include - TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +CFLAGS = -O3 -m64 -I./include -I../include + $(TEST_GEN_FILES): gzip_vas.c ../utils.c diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index 010160690227..b3946ce17e0c 100644 --- 
a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -4,16 +4,6 @@ include ../../../../../build/Build.include noarg: $(MAKE) -C ../../ -# The EBB handler is 64-bit code and everything links against it -CFLAGS += -m64 - -TMPOUT = $(OUTPUT)/TMPDIR/ -# Toolchains may build PIE by default which breaks the assembly -no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ - $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) - -LDFLAGS += $(no-pie-option) - TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ cycles_with_freeze_test pmc56_overflow_test \ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \ @@ -29,6 +19,16 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +# The EBB handler is 64-bit code and everything links against it +CFLAGS += -m64 + +TMPOUT = $(OUTPUT)/TMPDIR/ +# Toolchains may build PIE by default which breaks the assembly +no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) + +LDFLAGS += $(no-pie-option) + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \ ebb.c ebb_handler.S trace.c busy_loop.S diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 4e07d7046457..509d4b235b9e 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_test group_constraint_pmc_count_test \ group_constraint_repeat_test group_constraint_radix_scope_qual_test reserved_bits_mmcra_sample_elig_mode_test \ group_constraint_mmcra_sample_test 
invalid_event_code_test reserved_bits_mmcra_thresh_ctl_test \ @@ -12,4 +10,6 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +CFLAGS += -m64 + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c ../sampling_tests/misc.h ../sampling_tests/misc.c diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index 9e67351fb252..d45892151e05 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -CFLAGS += -m64 - TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test \ mmcr0_pmcjce_test mmcr0_fc56_pmc1ce_test mmcr0_fc56_pmc56_test \ mmcr1_comb_test mmcr2_l2l3_test mmcr2_fcs_fch_test \ @@ -12,4 +10,6 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +CFLAGS += -m64 + $(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c misc.c misc.h ../loop.S ../branch_loops.S diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 9b9491a63213..6dc5c5a42ca9 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -I$(CURDIR) - TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. 
include ../../lib.mk +CFLAGS += -I$(CURDIR) + $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index e0d979ab0204..0a08386be969 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -5,10 +5,10 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. -CFLAGS += $(KHDR_INCLUDES) - include ../../lib.mk +CFLAGS += $(KHDR_INCLUDES) + $(TEST_GEN_PROGS): ../harness.c ../utils.c $(OUTPUT)/spectre_v2: CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index f679d260afc8..b15d5dbccc24 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -3,7 +3,6 @@ TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart TEST_GEN_PROGS += sigreturn_kernel TEST_GEN_PROGS += sigreturn_unaligned -CFLAGS += -maltivec $(OUTPUT)/signal_tm: CFLAGS += -mhtm $(OUTPUT)/sigfuz: CFLAGS += -pthread -m64 @@ -12,4 +11,6 @@ TEST_FILES := settings top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += -maltivec + $(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 9c39f55a58ff..87c8c8f238da 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# The loops are all 64-bit code -CFLAGS += -I$(CURDIR) - EXTRA_SOURCES := ../harness.c build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi) @@ -27,9 +24,12 @@ $(OUTPUT)/strlen_32: CFLAGS += -m32 TEST_GEN_PROGS += strlen_32 endif -ASFLAGS = $(CFLAGS) - top_srcdir = ../../../../.. 
include ../../lib.mk +# The loops are all 64-bit code +CFLAGS += -I$(CURDIR) + +ASFLAGS = $(CFLAGS) + $(TEST_GEN_PROGS): $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index bdc081afedb0..8f0c2a1d3333 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -1,13 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := switch_endian_test -ASFLAGS += -O2 -Wall -g -nostdlib -m64 - EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +ASFLAGS += -O2 -Wall -g -nostdlib -m64 + $(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT) $(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index ee1740ddfb0c..83dc33500773 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := ipc_unmuxed rtas_filter -CFLAGS += $(KHDR_INCLUDES) - top_srcdir = ../../../../.. include ../../lib.mk +CFLAGS += $(KHDR_INCLUDES) + $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index cf65cbf33085..ddc09a20b80f 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -1,10 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only TEST_GEN_PROGS := test-vphn -CFLAGS += -m64 -I$(CURDIR) - top_srcdir = ../../../../.. 
include ../../lib.mk +CFLAGS += -m64 -I$(CURDIR) + $(TEST_GEN_PROGS): ../harness.c From 5553a79387e92ffd812a49fdcf679f392281f6a9 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:10 +0530 Subject: [PATCH 29/85] selftests/powerpc: Add flags.mk to support pmu buildable When running `make -C powerpc/pmu run_tests` from top level selftests directory, currently this error is being reported: make: Entering directory '/home/maddy/linux/tools/testing/selftests/powerpc/pmu' Makefile:40: warning: overriding recipe for target 'emit_tests' ../../lib.mk:111: warning: ignoring old recipe for target 'emit_tests' gcc -m64 count_instructions.c ../harness.c event.c lib.c ../utils.c loop.S -o /home/maddy/selftest_output//count_instructions In file included from count_instructions.c:13: event.h:12:10: fatal error: utils.h: No such file or directory 12 | #include "utils.h" | ^~~~~~~~~ compilation terminated. This is due to missing of include path in CFLAGS. That is, CFLAGS and GIT_VERSION macros are defined in the powerpc/ folder Makefile which in this case is not involved. To address the failure in case of executing specific sub-folder test directly, a new rule file has been addded by the patch called "flags.mk" under selftest/powerpc/ folder and is linked to all the Makefile of powerpc/pmu sub-folders. 
Reported-by: Sachin Sant Signed-off-by: Madhavan Srinivasan Tested-by: Sachin Sant [mpe: Fixup ifeq, make GIT_VERSION simply expanded to avoid re-executing git describe] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-2-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/flags.mk | 12 ++++++++++++ tools/testing/selftests/powerpc/pmu/Makefile | 1 + tools/testing/selftests/powerpc/pmu/ebb/Makefile | 1 + .../selftests/powerpc/pmu/event_code_tests/Makefile | 1 + .../selftests/powerpc/pmu/sampling_tests/Makefile | 1 + 5 files changed, 16 insertions(+) create mode 100644 tools/testing/selftests/powerpc/flags.mk diff --git a/tools/testing/selftests/powerpc/flags.mk b/tools/testing/selftests/powerpc/flags.mk new file mode 100644 index 000000000000..b909bee3cb2a --- /dev/null +++ b/tools/testing/selftests/powerpc/flags.mk @@ -0,0 +1,12 @@ +#This checks for any ENV variables and add those. + +ifeq ($(GIT_VERSION),) +GIT_VERSION := $(shell git describe --always --long --dirty || echo "unknown") +export GIT_VERSION +endif + +ifeq ($(CFLAGS),) +CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(selfdir)/powerpc/include $(CFLAGS) +export CFLAGS +endif + diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index a284fa874a9f..1fcacae1b188 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -7,6 +7,7 @@ EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile index b3946ce17e0c..1b39af7c10db 100644 --- a/tools/testing/selftests/powerpc/pmu/ebb/Makefile +++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile @@ -18,6 +18,7 @@ TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \ top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk # The EBB handler is 64-bit code and everything links against it CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile index 509d4b235b9e..fdb080b3fa65 100644 --- a/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/event_code_tests/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS := group_constraint_pmc56_test group_pmc56_exclude_constraints_te top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile index d45892151e05..9f79bec5fce7 100644 --- a/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile +++ b/tools/testing/selftests/powerpc/pmu/sampling_tests/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS := mmcr0_exceptionbits_test mmcr0_cc56run_test mmcr0_pmccext_test top_srcdir = ../../../../../.. include ../../../lib.mk +include ../../flags.mk CFLAGS += -m64 From 108e5e683333615023265a9a73a29d4c2fa16c70 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Thu, 29 Feb 2024 15:07:11 +0530 Subject: [PATCH 30/85] selftests/powerpc: make sub-folders buildable on their own Build breaks when executing make with run_tests for sub-folders under powerpc. 
This is because the CFLAGS and GIT_VERSION macros are defined in the Makefile of the top-level powerpc folder. make: Entering directory '/home/maddy/linux/tools/testing/selftests/powerpc/mm' gcc hugetlb_vs_thp_test.c ../harness.c ../utils.c -o /home/maddy/selftest_output//hugetlb_vs_thp_test hugetlb_vs_thp_test.c:6:10: fatal error: utils.h: No such file or directory 6 | #include "utils.h" | ^~~~~~~~~ compilation terminated. Fix this by adding the flags.mk in each sub-folder Makefile. Also remove the CFLAGS and GIT_VERSION macros from powerpc/ folder Makefile since the same are defined in flags.mk. Signed-off-by: Madhavan Srinivasan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240229093711.581230-3-maddy@linux.ibm.com --- tools/testing/selftests/powerpc/Makefile | 7 +------ tools/testing/selftests/powerpc/alignment/Makefile | 1 + tools/testing/selftests/powerpc/benchmarks/Makefile | 1 + tools/testing/selftests/powerpc/cache_shape/Makefile | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 1 + tools/testing/selftests/powerpc/dexcr/Makefile | 1 + tools/testing/selftests/powerpc/dscr/Makefile | 1 + tools/testing/selftests/powerpc/eeh/Makefile | 1 + tools/testing/selftests/powerpc/math/Makefile | 1 + tools/testing/selftests/powerpc/mce/Makefile | 1 + tools/testing/selftests/powerpc/mm/Makefile | 1 + tools/testing/selftests/powerpc/nx-gzip/Makefile | 1 + tools/testing/selftests/powerpc/papr_attributes/Makefile | 3 ++- tools/testing/selftests/powerpc/papr_sysparm/Makefile | 1 + tools/testing/selftests/powerpc/papr_vpd/Makefile | 1 + tools/testing/selftests/powerpc/primitives/Makefile | 1 + tools/testing/selftests/powerpc/ptrace/Makefile | 1 + tools/testing/selftests/powerpc/security/Makefile | 1 + tools/testing/selftests/powerpc/signal/Makefile | 1 + tools/testing/selftests/powerpc/stringloops/Makefile | 1 + tools/testing/selftests/powerpc/switch_endian/Makefile | 1 + tools/testing/selftests/powerpc/syscalls/Makefile | 1 + 
tools/testing/selftests/powerpc/tm/Makefile | 1 + tools/testing/selftests/powerpc/vphn/Makefile | 1 + 24 files changed, 25 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index c376151982c4..2f299fd04d2d 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -7,12 +7,6 @@ ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/) ifeq ($(ARCH),powerpc) -GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown") - -CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS) - -export CFLAGS - SUB_DIRS = alignment \ benchmarks \ cache_shape \ @@ -46,6 +40,7 @@ $(SUB_DIRS): BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all include ../lib.mk +include ./flags.mk override define RUN_TESTS +@for TARGET in $(SUB_DIRS); do \ diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93e9af37449d..66d5d7aaeb20 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index 75f5232c3aec..1321922038d0 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -6,6 +6,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk CFLAGS += -O2 diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 689f6c8ebcd8..3a3ca956ac66 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := cache_shape top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 72684ed589c0..42940f92d832 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -10,6 +10,7 @@ EXTRA_SOURCES := validate.c ../harness.c stubs.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk # The loops are all 64-bit code CFLAGS += -m64 diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 76210f2bcec3..523947a38d17 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -2,6 +2,7 @@ TEST_GEN_PROGS := hashchk_test TEST_GEN_FILES := lsdexcr include ../../lib.mk +include ../flags.mk $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 9289d5febe1e..9fa9cb5bd989 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -5,6 +5,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread $(OUTPUT)/dscr_explicit_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile index ae963eb2dc5b..70797716f2b5 100644 --- a/tools/testing/selftests/powerpc/eeh/Makefile +++ b/tools/testing/selftests/powerpc/eeh/Makefile @@ -7,3 +7,4 @@ TEST_FILES := eeh-functions.sh settings top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 3948f7c510aa..b14fd2e0c6a8 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c $(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec diff --git a/tools/testing/selftests/powerpc/mce/Makefile b/tools/testing/selftests/powerpc/mce/Makefile index 2424513982d9..ce4ed679aaaf 100644 --- a/tools/testing/selftests/powerpc/mce/Makefile +++ b/tools/testing/selftests/powerpc/mce/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := inject-ra-err include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 4a6608beef0e..aab058ecb352 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -13,6 +13,7 @@ TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile index b40991f902b2..480d8ba94cf7 100644 --- a/tools/testing/selftests/powerpc/nx-gzip/Makefile +++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile @@ -2,6 +2,7 @@ TEST_GEN_FILES := gzfht_test gunz_test TEST_PROGS := nx-gzip-test.sh include ../../lib.mk +include ../flags.mk CFLAGS = -O3 -m64 -I./include -I../include diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile b/tools/testing/selftests/powerpc/papr_attributes/Makefile index e899712d49db..406429499022 100644 --- a/tools/testing/selftests/powerpc/papr_attributes/Makefile +++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile @@ -3,5 +3,6 @@ TEST_GEN_PROGS := attr_test top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk -$(TEST_GEN_PROGS): ../harness.c ../utils.c \ No newline at end of file +$(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_sysparm/Makefile b/tools/testing/selftests/powerpc/papr_sysparm/Makefile index 7f79e437634a..fed4f2414dbf 100644 --- a/tools/testing/selftests/powerpc/papr_sysparm/Makefile +++ b/tools/testing/selftests/powerpc/papr_sysparm/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_sysparm top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/papr_vpd/Makefile b/tools/testing/selftests/powerpc/papr_vpd/Makefile index 06b719703bfd..b09852e40882 100644 --- a/tools/testing/selftests/powerpc/papr_vpd/Makefile +++ b/tools/testing/selftests/powerpc/papr_vpd/Makefile @@ -6,6 +6,7 @@ TEST_GEN_PROGS := papr_vpd top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile index 6dc5c5a42ca9..23bd9a7590dd 100644 --- a/tools/testing/selftests/powerpc/primitives/Makefile +++ b/tools/testing/selftests/powerpc/primitives/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := load_unaligned_zeropad top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -I$(CURDIR) diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile index 1b39b86849da..59ca01d8567e 100644 --- a/tools/testing/selftests/powerpc/ptrace/Makefile +++ b/tools/testing/selftests/powerpc/ptrace/Makefile @@ -26,6 +26,7 @@ LOCAL_HDRS += $(patsubst %,$(selfdir)/powerpc/ptrace/%,$(wildcard *.h)) top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk TM_TESTS := $(patsubst %,$(OUTPUT)/%,$(TM_TESTS)) TESTS_64 := $(patsubst %,$(OUTPUT)/%,$(TESTS_64)) diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index 0a08386be969..33286039724a 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -6,6 +6,7 @@ TEST_PROGS := mitigation-patching.sh top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile index b15d5dbccc24..ece95bd52be9 100644 --- a/tools/testing/selftests/powerpc/signal/Makefile +++ b/tools/testing/selftests/powerpc/signal/Makefile @@ -10,6 +10,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk CFLAGS += -maltivec diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 87c8c8f238da..4c9d9a58c9d1 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -26,6 +26,7 @@ endif top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk # The loops are all 64-bit code CFLAGS += -I$(CURDIR) diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile index 8f0c2a1d3333..0da2e0a74264 100644 --- a/tools/testing/selftests/powerpc/switch_endian/Makefile +++ b/tools/testing/selftests/powerpc/switch_endian/Makefile @@ -5,6 +5,7 @@ EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk ASFLAGS += -O2 -Wall -g -nostdlib -m64 diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile index 83dc33500773..3bc07af88f0e 100644 --- a/tools/testing/selftests/powerpc/syscalls/Makefile +++ b/tools/testing/selftests/powerpc/syscalls/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := ipc_unmuxed rtas_filter top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += $(KHDR_INCLUDES) diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile index 3876805c2f31..f13f0ab36007 100644 --- a/tools/testing/selftests/powerpc/tm/Makefile +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -11,6 +11,7 @@ TEST_FILES := settings top_srcdir = ../../../../.. 
include ../../lib.mk +include ../flags.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile index ddc09a20b80f..61d519a076c6 100644 --- a/tools/testing/selftests/powerpc/vphn/Makefile +++ b/tools/testing/selftests/powerpc/vphn/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := test-vphn top_srcdir = ../../../../.. include ../../lib.mk +include ../flags.mk CFLAGS += -m64 -I$(CURDIR) From 822a04957cc5e675570645f506270797a1cf2865 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 22 Apr 2024 23:34:52 +1000 Subject: [PATCH 31/85] selftests/powerpc: Convert pmu Makefile to for loop style The pmu Makefile has grown more sub directories over the years. Rather than open coding the rules for each subdir, use for loops. Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422133453.1793988-1-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/pmu/Makefile | 43 ++++++++++---------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 1fcacae1b188..773933e5180e 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -9,7 +9,9 @@ top_srcdir = ../../../../.. 
include ../../lib.mk include ../flags.mk -all: $(TEST_GEN_PROGS) ebb sampling_tests event_code_tests +SUB_DIRS := ebb sampling_tests event_code_tests + +all: $(TEST_GEN_PROGS) $(SUB_DIRS) $(TEST_GEN_PROGS): $(EXTRA_SOURCES) @@ -23,12 +25,16 @@ $(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES) $(OUTPUT)/per_event_excludes: ../utils.c +$(SUB_DIRS): + BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all + DEFAULT_RUN_TESTS := $(RUN_TESTS) override define RUN_TESTS $(DEFAULT_RUN_TESTS) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests; \ + done; endef emit_tests: @@ -36,34 +42,29 @@ emit_tests: BASENAME_TEST=`basename $$TEST`; \ echo "$(COLLECTION):$$BASENAME_TEST"; \ done - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests; \ + done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET 
install + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install; \ + done; endef DEFAULT_CLEAN := $(CLEAN) override define CLEAN $(DEFAULT_CLEAN) $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o - +TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=sampling_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean - +TARGET=event_code_tests; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean + +@for TARGET in $(SUB_DIRS); do \ + BUILD_TARGET=$(OUTPUT)/$$TARGET; \ + $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \ + done; endef -ebb: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -sampling_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all - -event_code_tests: - TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all .PHONY: all run_tests ebb sampling_tests event_code_tests emit_tests From dda32e37d397f5937cc24a6e98b71d3645f51afa Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 22 Apr 2024 23:34:53 +1000 Subject: [PATCH 32/85] selftests/powerpc: Install tests in sub-directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sources for the powerpc selftests are arranged into sub-directories. However when the tests are built and installed, the sub-directories are squashed, losing the structure. 
For example, with the current code the result of installing the selftests is: $ tree tools/testing/selftests/kselftest_install tools/testing/selftests/kselftest_install ├── kselftest │   ├── ktap_helpers.sh │   ├── module.sh │   ├── prefix.pl │   └── runner.sh ├── kselftest-list.txt ├── powerpc │   ├── alignment_handler │   ├── attr_test │   ├── back_to_back_ebbs_test │   ├── bad_accesses │   ├── bhrb_filter_map_test │   ├── bhrb_no_crash_wo_pmu_test │   ├── blacklisted_events_test │   ├── cache_shape │   ├── close_clears_pmcc_test │   ├── context_switch │   ├── copy_first_unaligned ... │   ├── settings ... │   └── wild_bctr └── run_kselftest.sh All the powerpc tests are squashed into the single powerpc directory. In particular, note that there is a single `settings` file, even though there are multiple settings files in the powerpc selftest sources. One of the settings files ends up installed, depending on install order, even if they have different contents. Similarly if there were two tests with the same name in different sub-directories they would clobber each other. Fix it by replicating the directory structure of the source tree into the install directory. The result being for example: $ tree tools/testing/selftests/kselftest_install tools/testing/selftests/kselftest_install ├── kselftest │   ├── ktap_helpers.sh │   ├── module.sh │   ├── prefix.pl │   └── runner.sh ├── kselftest-list.txt ├── powerpc │   ├── alignment │   │   ├── alignment_handler │   │   └── copy_first_unaligned │   ├── benchmarks │   │   ├── context_switch │   │   ├── exec_target │   │   ├── fork │   │   ├── futex_bench │   │   ├── gettimeofday │   │   ├── mmap_bench │   │   ├── null_syscall │   │   └── settings ... │   ├── eeh │   │   ├── eeh-basic.sh │   │   ├── eeh-functions.sh │   │   └── settings ... │   └── vphn │   └── test-vphn └── run_kselftest.sh Note multiple settings files in different sub-directories. 
This change also has the effect of changing the names of the tests from the point of view of the kselftest runner. Before the tests are named eg: powerpc:copy_first_unaligned powerpc:cache_shape powerpc:reg_access_test After, the test collection names include the sub-directory: powerpc/alignment:copy_first_unaligned powerpc/cache_shape:cache_shape powerpc/pmu/ebb:reg_access_test That means whereas previously all powerpc tests could be run with: $ ./run_kselftest.sh -c powerpc After the change it's necessary to pass a regex that matches all powerpc entries, eg: $ ./run_kselftest.sh -c "powerpc.*" The latter form also works before and after the change. Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422133453.1793988-2-mpe@ellerman.id.au --- tools/testing/selftests/powerpc/Makefile | 4 ++-- tools/testing/selftests/powerpc/pmu/Makefile | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 2f299fd04d2d..b175e94e1901 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -52,14 +52,14 @@ endef override define INSTALL_RULE +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install;\ done; endef emit_tests: +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET $@;\ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET $@;\ done; override define CLEAN diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile index 773933e5180e..7e9dbf3d0d09 100644 --- a/tools/testing/selftests/powerpc/pmu/Makefile +++ b/tools/testing/selftests/powerpc/pmu/Makefile @@ -44,7 +44,7 @@ emit_tests: done +@for TARGET in $(SUB_DIRS); do \ 
BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests; \ + $(MAKE) OUTPUT=$$BUILD_TARGET COLLECTION=$(COLLECTION)/$$TARGET -s -C $$TARGET emit_tests; \ done; DEFAULT_INSTALL_RULE := $(INSTALL_RULE) @@ -52,7 +52,7 @@ override define INSTALL_RULE $(DEFAULT_INSTALL_RULE) +@for TARGET in $(SUB_DIRS); do \ BUILD_TARGET=$(OUTPUT)/$$TARGET; \ - $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install; \ + $(MAKE) OUTPUT=$$BUILD_TARGET INSTALL_PATH=$$INSTALL_PATH/$$TARGET -C $$TARGET install; \ done; endef From 84030aacf127d000180fa3cb4b589d8ab1b0d46b Mon Sep 17 00:00:00 2001 From: sundar Date: Wed, 24 Apr 2024 20:37:18 +0530 Subject: [PATCH 33/85] macintosh/macio-adb: replace of_node_put() with __free use the new cleanup magic to replace of_node_put() with __free(device_node) marking to auto release when they get out of scope. Suggested-by: Julia Lawall Signed-off-by: sundar Signed-off-by: Michael Ellerman Link: https://msgid.link/20240424150718.5006-1-prosunofficial@gmail.com --- drivers/macintosh/macio-adb.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 779f1268286e..19c63959ebed 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -83,35 +83,32 @@ struct adb_driver macio_adb_driver = { int macio_probe(void) { - struct device_node *np; + struct device_node *np __free(device_node) = + of_find_compatible_node(NULL, "adb", "chrp,adb0"); - np = of_find_compatible_node(NULL, "adb", "chrp,adb0"); - if (np) { - of_node_put(np); + if (np) return 0; - } + return -ENODEV; } int macio_init(void) { - struct device_node *adbs; + struct device_node *adbs __free(device_node) = + of_find_compatible_node(NULL, "adb", "chrp,adb0"); struct resource r; unsigned int irq; - adbs = of_find_compatible_node(NULL, "adb", "chrp,adb0"); if (!adbs) return -ENXIO; - if (of_address_to_resource(adbs, 0, &r)) { - 
of_node_put(adbs); + if (of_address_to_resource(adbs, 0, &r)) return -ENXIO; - } + adb = ioremap(r.start, sizeof(struct adb_regs)); - if (!adb) { - of_node_put(adbs); + if (!adb) return -ENOMEM; - } + out_8(&adb->ctrl.r, 0); out_8(&adb->intr.r, 0); @@ -121,7 +118,6 @@ int macio_init(void) out_8(&adb->autopoll.r, APE); irq = irq_of_parse_and_map(adbs, 0); - of_node_put(adbs); if (request_irq(irq, macio_adb_interrupt, 0, "ADB", (void *)0)) { iounmap(adb); printk(KERN_ERR "ADB: can't get irq %d\n", irq); From 236a4c63491784ae4814100cca47bc3645c776df Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 22 Apr 2024 21:52:31 +1000 Subject: [PATCH 34/85] powerpc: Mark memory_limit as initdata The `memory_limit` variable should only be used during boot, enforce that by marking it initdata. Signed-off-by: Michael Ellerman Link: https://msgid.link/20240422115231.1769984-1-mpe@ellerman.id.au --- arch/powerpc/mm/mem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 3a440004b97d..12316ac66e7e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -30,7 +30,7 @@ #include -unsigned long long memory_limit; +unsigned long long memory_limit __initdata; unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); From 6a3e640b5dcf56fb44d66d525e01ea08633c6b8b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 30 Apr 2024 14:42:28 +1000 Subject: [PATCH 35/85] MAINTAINERS: powerpc: Remove Aneesh Aneesh is stepping down from powerpc maintenance. 
Acked-by: Aneesh Kumar K.V (Arm) Signed-off-by: Michael Ellerman Link: https://msgid.link/20240430044228.49015-1-mpe@ellerman.id.au --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..93af33ee8541 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12478,7 +12478,6 @@ LINUX FOR POWERPC (32-BIT AND 64-BIT) M: Michael Ellerman R: Nicholas Piggin R: Christophe Leroy -R: Aneesh Kumar K.V R: Naveen N. Rao L: linuxppc-dev@lists.ozlabs.org S: Supported From 1fcd254733371cfa5a3602bab5ae2c9dc4bf69e6 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 30 Apr 2024 14:43:27 +1000 Subject: [PATCH 36/85] MAINTAINERS: MMU GATHER: Update Aneesh's address Aneesh's IBM address no longer works, switch to his preferred kernel.org address. Acked-by: Aneesh Kumar K.V (Arm) Signed-off-by: Michael Ellerman Link: https://msgid.link/20240430044327.49363-1-mpe@ellerman.id.au --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 93af33ee8541..f096c9fff5b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14898,7 +14898,7 @@ F: drivers/phy/marvell/phy-pxa-usb.c MMU GATHER AND TLB INVALIDATION M: Will Deacon -M: "Aneesh Kumar K.V" +M: "Aneesh Kumar K.V" M: Andrew Morton M: Nick Piggin M: Peter Zijlstra From 9803af291162dbca4b9773586a3f5c392f0dd974 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 2 May 2024 23:50:40 +0530 Subject: [PATCH 37/85] powerpc/crash: remove unnecessary NULL check before kvfree() Fix the following coccicheck build warning: arch/powerpc/kexec/crash.c:488:2-8: WARNING: NULL check before some freeing functions is not needed. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202404261048.skfV5DDB-lkp@intel.com/ Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240502182040.774759-1-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/crash.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index 21b193e938a3..9ac3266e4965 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -484,8 +484,7 @@ static void update_crash_elfcorehdr(struct kimage *image, struct memory_notify * } out: kvfree(cmem); - if (elfbuf) - kvfree(elfbuf); + kvfree(elfbuf); } /** From 4071739249fd2e647e7058dbab0db4ddc0a0c427 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 May 2024 01:23:17 +0100 Subject: [PATCH 38/85] powerpc/module: Remove arch specific module bug stuff The last function to reference module_bug_list went in 2008's commit b9754568ef17 ("powerpc: Remove dead module_find_bug code") but I don't think that was called since 2006's commit 73c9ceab40b1 ("[POWERPC] Generic BUG for powerpc") Now that the list has gone, I think we can also clean up the bug entries in mod_arch_specific. Lightly boot tested. Signed-off-by: Dr. 
David Alan Gilbert Signed-off-by: Michael Ellerman Link: https://msgid.link/20240503002317.183500-1-linux@treblig.org --- arch/powerpc/include/asm/module.h | 5 ----- arch/powerpc/kernel/module.c | 2 -- 2 files changed, 7 deletions(-) diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index a8e2e8339fb7..300c777cc307 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -48,11 +48,6 @@ struct mod_arch_specific { unsigned long tramp; unsigned long tramp_regs; #endif - - /* List of BUG addresses, source line numbers and filenames */ - struct list_head bug_list; - struct bug_entry *bug_table; - unsigned int num_bugs; }; /* diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c index f6d6ae0a1692..8989e069e3aa 100644 --- a/arch/powerpc/kernel/module.c +++ b/arch/powerpc/kernel/module.c @@ -17,8 +17,6 @@ #include #include -static LIST_HEAD(module_bug_list); - static const Elf_Shdr *find_section(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, const char *name) From d7228a58d9438d6f219dc7f33eab0d1980b3bd2f Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:17 +1000 Subject: [PATCH 39/85] selftests/powerpc/dexcr: Add -no-pie to hashchk tests The hashchk tests want to verify that the hash key is changed over exec. It does so by calculating hashes at the same address across an exec. This is made simpler by disabling PIE functionality, so we can re-execute ourselves and be using the same addresses in the child. While -fno-pie is already added, -no-pie is also required. 
Fixes: bdb07f35a52f ("selftests/powerpc/dexcr: Add hashst/hashchk test") Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-2-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 523947a38d17..1b775959f981 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -4,7 +4,7 @@ TEST_GEN_FILES := lsdexcr include ../../lib.mk include ../flags.mk -$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie $(call cc-option,-mno-rop-protect) +$(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c $(TEST_GEN_FILES): ../utils.c ./dexcr.c From 75171f06c4507c3b6b5a69d793879fb20d108bb1 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:18 +1000 Subject: [PATCH 40/85] powerpc/dexcr: Track the DEXCR per-process Add capability to make the DEXCR act as a per-process SPR. We do not yet have an interface for changing the values per task. We also expect the kernel to use a single DEXCR value across all tasks while in privileged state, so there is no need to synchronize after changing it (the userspace aspects will synchronize upon returning to userspace). 
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-3-bgray@linux.ibm.com --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/kernel/process.c | 10 ++++++++++ arch/powerpc/kernel/ptrace/ptrace-view.c | 7 +------ 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index b2c51d337e60..882e31296ea6 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -260,6 +260,7 @@ struct thread_struct { unsigned long sier2; unsigned long sier3; unsigned long hashkeyr; + unsigned long dexcr; #endif }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9452a54d356c..d482c3fd81d7 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1185,6 +1185,9 @@ static inline void save_sprs(struct thread_struct *t) if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) t->hashkeyr = mfspr(SPRN_HASHKEYR); + + if (cpu_has_feature(CPU_FTR_ARCH_31)) + t->dexcr = mfspr(SPRN_DEXCR); #endif } @@ -1267,6 +1270,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE) && old_thread->hashkeyr != new_thread->hashkeyr) mtspr(SPRN_HASHKEYR, new_thread->hashkeyr); + + if (cpu_has_feature(CPU_FTR_ARCH_31) && + old_thread->dexcr != new_thread->dexcr) + mtspr(SPRN_DEXCR, new_thread->dexcr); #endif } @@ -1878,6 +1885,9 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) #ifdef CONFIG_PPC_BOOK3S_64 if (cpu_has_feature(CPU_FTR_DEXCR_NPHIE)) p->thread.hashkeyr = current->thread.hashkeyr; + + if (cpu_has_feature(CPU_FTR_ARCH_31)) + p->thread.dexcr = mfspr(SPRN_DEXCR); #endif return 0; } diff --git a/arch/powerpc/kernel/ptrace/ptrace-view.c b/arch/powerpc/kernel/ptrace/ptrace-view.c index 584cf5c3df50..c1819e0a6684 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-view.c +++ 
b/arch/powerpc/kernel/ptrace/ptrace-view.c @@ -469,12 +469,7 @@ static int dexcr_get(struct task_struct *target, const struct user_regset *regse if (!cpu_has_feature(CPU_FTR_ARCH_31)) return -ENODEV; - /* - * The DEXCR is currently static across all CPUs, so we don't - * store the target's value anywhere, but the static value - * will also be correct. - */ - membuf_store(&to, (u64)lower_32_bits(DEXCR_INIT)); + membuf_store(&to, (u64)lower_32_bits(target->thread.dexcr)); /* * Technically the HDEXCR is per-cpu, but a hypervisor can't reasonably From bbd99922d0f4518518282217159666c679c6a0d1 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:19 +1000 Subject: [PATCH 41/85] powerpc/dexcr: Reset DEXCR value across exec Inheriting the DEXCR across exec can have security and usability concerns. If a program is compiled with hash instructions it generally expects to run with NPHIE enabled. But if the parent process disables NPHIE then if it's not careful it will be disabled for any children too and the protection offered by hash checks is basically worthless. This patch introduces a per-process reset value that new execs in a particular process tree are initialized with. This enables fine grained control over what DEXCR value child processes run with by default. For example, containers running legacy binaries that expect hash instructions to act as NOPs could configure the reset value of the container root to control the default reset value for all members of the container. 
Signed-off-by: Benjamin Gray [mpe: Add missing SPDX tag on dexcr.c] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-4-bgray@linux.ibm.com --- arch/powerpc/include/asm/processor.h | 2 +- arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/dexcr.c | 23 +++++++++++++++++++++++ arch/powerpc/kernel/process.c | 7 +++++++ 4 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/dexcr.c diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 882e31296ea6..aad85a24134a 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -261,7 +261,7 @@ struct thread_struct { unsigned long sier3; unsigned long hashkeyr; unsigned long dexcr; - + unsigned long dexcr_onexec; /* Reset value to load on exec */ #endif }; diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index d3282fbea4f2..1d183b077948 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_DAWR) += dawr.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o +obj-$(CONFIG_PPC_BOOK3S_64) += dexcr.o obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_64e.o obj-$(CONFIG_PPC_BARRIER_NOSPEC) += security.o diff --git a/arch/powerpc/kernel/dexcr.c b/arch/powerpc/kernel/dexcr.c new file mode 100644 index 000000000000..d5cd77421088 --- /dev/null +++ b/arch/powerpc/kernel/dexcr.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int __init init_task_dexcr(void) +{ + if (!early_cpu_has_feature(CPU_FTR_ARCH_31)) + return 0; + + current->thread.dexcr_onexec = mfspr(SPRN_DEXCR); + + return 0; +} +early_initcall(init_task_dexcr) diff --git 
a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index d482c3fd81d7..8ab779a3bdde 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1641,6 +1641,13 @@ void arch_setup_new_exec(void) current->thread.regs->amr = default_amr; current->thread.regs->iamr = default_iamr; #endif + +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_31)) { + current->thread.dexcr = current->thread.dexcr_onexec; + mtspr(SPRN_DEXCR, current->thread.dexcr); + } +#endif /* CONFIG_PPC_BOOK3S_64 */ } #ifdef CONFIG_PPC64 From 628d701f2de5b9a16d1dd82bea68fd895f56f1a1 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:20 +1000 Subject: [PATCH 42/85] powerpc/dexcr: Add DEXCR prctl interface Now that we track a DEXCR on a per-task basis, individual tasks are free to configure it as they like. The interface is a pair of getter/setter prctl's that work on a single aspect at a time (multiple aspects at once is more difficult if there are different rules applied for each aspect, now or in future). The getter shows the current state of the process config, and the setter allows setting/clearing the aspect. 
Signed-off-by: Benjamin Gray [mpe: Account for PR_RISCV_SET_ICACHE_FLUSH_CTX, shrink some long lines] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-5-bgray@linux.ibm.com --- arch/powerpc/include/asm/processor.h | 10 +++ arch/powerpc/kernel/dexcr.c | 101 +++++++++++++++++++++++++++ include/uapi/linux/prctl.h | 16 +++++ kernel/sys.c | 16 +++++ 4 files changed, 143 insertions(+) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index aad85a24134a..e44cac0da346 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -334,6 +334,16 @@ extern int set_endian(struct task_struct *tsk, unsigned int val); extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr); extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val); +#ifdef CONFIG_PPC_BOOK3S_64 + +#define PPC_GET_DEXCR_ASPECT(tsk, asp) get_dexcr_prctl((tsk), (asp)) +#define PPC_SET_DEXCR_ASPECT(tsk, asp, val) set_dexcr_prctl((tsk), (asp), (val)) + +int get_dexcr_prctl(struct task_struct *tsk, unsigned long asp); +int set_dexcr_prctl(struct task_struct *tsk, unsigned long asp, unsigned long val); + +#endif + extern void load_fp_state(struct thread_fp_state *fp); extern void store_fp_state(struct thread_fp_state *fp); extern void load_vr_state(struct thread_vr_state *vr); diff --git a/arch/powerpc/kernel/dexcr.c b/arch/powerpc/kernel/dexcr.c index d5cd77421088..3a0358e91c60 100644 --- a/arch/powerpc/kernel/dexcr.c +++ b/arch/powerpc/kernel/dexcr.c @@ -21,3 +21,104 @@ static int __init init_task_dexcr(void) return 0; } early_initcall(init_task_dexcr) + +/* Allow thread local configuration of these by default */ +#define DEXCR_PRCTL_EDITABLE ( \ + DEXCR_PR_IBRTPD | \ + DEXCR_PR_SRAPD | \ + DEXCR_PR_NPHIE) + +static int prctl_to_aspect(unsigned long which, unsigned int *aspect) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + *aspect = DEXCR_PR_SBHE; + break; + case PR_PPC_DEXCR_IBRTPD: +
*aspect = DEXCR_PR_IBRTPD; + break; + case PR_PPC_DEXCR_SRAPD: + *aspect = DEXCR_PR_SRAPD; + break; + case PR_PPC_DEXCR_NPHIE: + *aspect = DEXCR_PR_NPHIE; + break; + default: + return -ENODEV; + } + + return 0; +} + +int get_dexcr_prctl(struct task_struct *task, unsigned long which) +{ + unsigned int aspect; + int ret; + + ret = prctl_to_aspect(which, &aspect); + if (ret) + return ret; + + if (aspect & DEXCR_PRCTL_EDITABLE) + ret |= PR_PPC_DEXCR_CTRL_EDITABLE; + + if (aspect & mfspr(SPRN_DEXCR)) + ret |= PR_PPC_DEXCR_CTRL_SET; + else + ret |= PR_PPC_DEXCR_CTRL_CLEAR; + + if (aspect & task->thread.dexcr_onexec) + ret |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + else + ret |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + + return ret; +} + +int set_dexcr_prctl(struct task_struct *task, unsigned long which, unsigned long ctrl) +{ + unsigned long dexcr; + unsigned int aspect; + int err = 0; + + err = prctl_to_aspect(which, &aspect); + if (err) + return err; + + if (!(aspect & DEXCR_PRCTL_EDITABLE)) + return -EPERM; + + if (ctrl & ~PR_PPC_DEXCR_CTRL_MASK) + return -EINVAL; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET && ctrl & PR_PPC_DEXCR_CTRL_CLEAR) + return -EINVAL; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC && ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC) + return -EINVAL; + + /* + * We do not want an unprivileged process being able to disable + * a setuid process's hash check instructions + */ + if (aspect == DEXCR_PR_NPHIE && + ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + dexcr = mfspr(SPRN_DEXCR); + + if (ctrl & PR_PPC_DEXCR_CTRL_SET) + dexcr |= aspect; + else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) + dexcr &= ~aspect; + + if (ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC) + task->thread.dexcr_onexec |= aspect; + else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC) + task->thread.dexcr_onexec &= ~aspect; + + mtspr(SPRN_DEXCR, dexcr); + + return 0; +} diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 370ed14b1ae0..713d28788df7 100644 --- 
a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -306,4 +306,20 @@ struct prctl_mm_map { # define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc # define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f +/* PowerPC Dynamic Execution Control Register (DEXCR) controls */ +#define PR_PPC_GET_DEXCR 72 +#define PR_PPC_SET_DEXCR 73 +/* DEXCR aspect to act on */ +# define PR_PPC_DEXCR_SBHE 0 /* Speculative branch hint enable */ +# define PR_PPC_DEXCR_IBRTPD 1 /* Indirect branch recurrent target prediction disable */ +# define PR_PPC_DEXCR_SRAPD 2 /* Subroutine return address prediction disable */ +# define PR_PPC_DEXCR_NPHIE 3 /* Non-privileged hash instruction enable */ +/* Action to apply / return */ +# define PR_PPC_DEXCR_CTRL_EDITABLE 0x1 /* Aspect can be modified with PR_PPC_SET_DEXCR */ +# define PR_PPC_DEXCR_CTRL_SET 0x2 /* Set the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_CLEAR 0x4 /* Clear the aspect for this process */ +# define PR_PPC_DEXCR_CTRL_SET_ONEXEC 0x8 /* Set the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */ +# define PR_PPC_DEXCR_CTRL_MASK 0x1f + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 8bb106a56b3a..f9c95410278c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -146,6 +146,12 @@ #ifndef RISCV_V_GET_CONTROL # define RISCV_V_GET_CONTROL() (-EINVAL) #endif +#ifndef PPC_GET_DEXCR_ASPECT +# define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL) +#endif +#ifndef PPC_SET_DEXCR_ASPECT +# define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2726,6 +2732,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_MDWE: error = prctl_get_mdwe(arg2, arg3, arg4, arg5); break; + case PR_PPC_GET_DEXCR: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = PPC_GET_DEXCR_ASPECT(me, arg2); + break; + case PR_PPC_SET_DEXCR: + if (arg4 || arg5) + return -EINVAL; + error = 
PPC_SET_DEXCR_ASPECT(me, arg2, arg3); + break; case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; From 5bfa66bf86d792bbcc76bc09cf99a2ae9d6e0eec Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:21 +1000 Subject: [PATCH 43/85] selftests/powerpc/dexcr: Add DEXCR prctl interface test Some basic tests of the prctl interface of the DEXCR. Signed-off-by: Benjamin Gray [mpe: Add missing SPDX tag] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-6-bgray@linux.ibm.com --- .../selftests/powerpc/dexcr/.gitignore | 1 + .../testing/selftests/powerpc/dexcr/Makefile | 4 +- tools/testing/selftests/powerpc/dexcr/dexcr.c | 40 ++++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 10 + .../selftests/powerpc/dexcr/dexcr_test.c | 215 ++++++++++++++++++ 5 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/dexcr/dexcr_test.c diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index b82f45dd46b9..5d526613cd26 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,2 +1,3 @@ +dexcr_test hashchk_test lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile b/tools/testing/selftests/powerpc/dexcr/Makefile index 1b775959f981..3b685b28f029 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,9 +1,11 @@ -TEST_GEN_PROGS := hashchk_test +TEST_GEN_PROGS := dexcr_test hashchk_test TEST_GEN_FILES := lsdexcr include ../../lib.mk include ../flags.mk +CFLAGS += $(KHDR_INCLUDES) + $(OUTPUT)/hashchk_test: CFLAGS += -fno-pie -no-pie $(call cc-option,-mno-rop-protect) $(TEST_GEN_PROGS): ../harness.c ../utils.c ./dexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.c b/tools/testing/selftests/powerpc/dexcr/dexcr.c index 65ec5347de98..468fd0dc9912 100644 --- 
a/tools/testing/selftests/powerpc/dexcr/dexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,45 @@ out: return exists; } +unsigned int pr_which_to_aspect(unsigned long which) +{ + switch (which) { + case PR_PPC_DEXCR_SBHE: + return DEXCR_PR_SBHE; + case PR_PPC_DEXCR_IBRTPD: + return DEXCR_PR_IBRTPD; + case PR_PPC_DEXCR_SRAPD: + return DEXCR_PR_SRAPD; + case PR_PPC_DEXCR_NPHIE: + return DEXCR_PR_NPHIE; + default: + FAIL_IF_EXIT_MSG(true, "unknown PR aspect"); + } +} + +int pr_get_dexcr(unsigned long which) +{ + return prctl(PR_PPC_GET_DEXCR, which, 0UL, 0UL, 0UL); +} + +int pr_set_dexcr(unsigned long which, unsigned long ctrl) +{ + return prctl(PR_PPC_SET_DEXCR, which, ctrl, 0UL, 0UL); +} + +bool pr_dexcr_aspect_supported(unsigned long which) +{ + if (pr_get_dexcr(which) == -1) + return errno == ENODEV; + + return true; +} + +bool pr_dexcr_aspect_editable(unsigned long which) +{ + return pr_get_dexcr(which) & PR_PPC_DEXCR_CTRL_EDITABLE; +} + /* * Just test if a bad hashchk triggers a signal, without checking * for support or if the NPHIE aspect is enabled. 
diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index f55cbbc8643b..a6aa7eac11da 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -28,6 +28,16 @@ bool dexcr_exists(void); +bool pr_dexcr_aspect_supported(unsigned long which); + +bool pr_dexcr_aspect_editable(unsigned long which); + +int pr_get_dexcr(unsigned long pr_aspect); + +int pr_set_dexcr(unsigned long pr_aspect, unsigned long ctrl); + +unsigned int pr_which_to_aspect(unsigned long which); + bool hashchk_triggers(void); enum dexcr_source { diff --git a/tools/testing/selftests/powerpc/dexcr/dexcr_test.c b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c new file mode 100644 index 000000000000..7a8657164908 --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/dexcr_test.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +/* + * Helper function for testing the behaviour of a newly exec-ed process + */ +static int dexcr_prctl_onexec_test_child(unsigned long which, const char *status) +{ + unsigned long dexcr = mfspr(SPRN_DEXCR_RO); + unsigned long aspect = pr_which_to_aspect(which); + int ctrl = pr_get_dexcr(which); + + if (!strcmp(status, "set")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), + "setting aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(!(aspect & dexcr), "setting aspect across exec did not take effect"); + } else if (!strcmp(status, "clear")) { + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), + "clearing aspect across exec not applied"); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect across exec not inherited"); + + FAIL_IF_EXIT_MSG(aspect & dexcr, "clearing aspect across exec did not take 
effect"); + } else { + FAIL_IF_EXIT_MSG(true, "unknown expected status"); + } + + return 0; +} + +/* + * Test that the given prctl value can be manipulated freely + */ +static int dexcr_prctl_aspect_test(unsigned long which) +{ + unsigned long aspect = pr_which_to_aspect(which); + pid_t pid; + int ctrl; + int err; + int errno_save; + + SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_supported(which), "DEXCR aspect not supported"); + SKIP_IF_MSG(!pr_dexcr_aspect_editable(which), "DEXCR aspect not editable with prctl"); + + /* We reject invalid combinations of arguments */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear should be rejected with EINVAL"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + errno_save = errno; + FAIL_IF_MSG(err != -1, "simultaneous set and clear on exec should be rejected"); + FAIL_IF_MSG(errno_save != EINVAL, "simultaneous set and clear on exec should be rejected with EINVAL"); + + /* We set the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR, "config value unexpected clear flag"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "setting aspect did not take effect"); + + /* We clear the aspect */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET, "config value unexpected set flag"); + FAIL_IF_MSG(aspect & 
mfspr(SPRN_DEXCR_RO), "clearing aspect did not take effect"); + + /* We make it set on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC, "config value unexpected clear on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "scheduling aspect to set on exec should not change it now"); + + /* We make it clear on exec (doesn't change our current value) */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "process aspect config should still be cleared"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC, "config value unexpected set on exec flag"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should still be cleared"); + + /* We allow setting the current and on-exec value in a single call */ + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET), "config value not PR_PPC_DEXCR_CTRL_SET"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC"); + FAIL_IF_MSG(!(aspect & mfspr(SPRN_DEXCR_RO)), "process aspect should be set"); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_CLEAR | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_CLEAR | 
PR_PPC_DEXCR_CTRL_SET_ONEXEC failed"); + + ctrl = pr_get_dexcr(which); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR), "config value not PR_PPC_DEXCR_CTRL_CLEAR"); + FAIL_IF_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), "config value not PR_PPC_DEXCR_CTRL_SET_ONEXEC"); + FAIL_IF_MSG(aspect & mfspr(SPRN_DEXCR_RO), "process aspect should be clear"); + + /* Verify the onexec value is applied across exec */ + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "set", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_SET_ONEXEC), + "setting aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(mfspr(SPRN_DEXCR_RO) & aspect, + "setting aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + err = pr_set_dexcr(which, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC); + FAIL_IF_MSG(err, "PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC failed"); + + pid = fork(); + if (!pid) { + char which_str[32] = {}; + char *args[] = { "dexcr_prctl_onexec_test_child", which_str, "clear", NULL }; + unsigned int ctrl = pr_get_dexcr(which); + + sprintf(which_str, "%lu", which); + + FAIL_IF_EXIT_MSG(!(ctrl & PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC), + "clearing aspect on exec not copied across fork"); + + FAIL_IF_EXIT_MSG(!(mfspr(SPRN_DEXCR_RO) & aspect), + "clearing aspect on exec wrongly applied to fork"); + + execve("/proc/self/exe", args, NULL); + _exit(errno); + } + await_child_success(pid); + + return 0; +} + +static int dexcr_prctl_ibrtpd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_IBRTPD); +} + +static int dexcr_prctl_srapd_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_SRAPD); +} + +static int dexcr_prctl_nphie_test(void) +{ + return dexcr_prctl_aspect_test(PR_PPC_DEXCR_NPHIE); +} + +int main(int argc, char 
*argv[]) +{ + int err = 0; + + /* + * Some tests require checking what happens across exec, so we may be + * invoked as the child of a particular test + */ + if (argc > 1) { + if (argc == 3 && !strcmp(argv[0], "dexcr_prctl_onexec_test_child")) { + unsigned long which; + + err = parse_ulong(argv[1], strlen(argv[1]), &which, 10); + FAIL_IF_MSG(err, "failed to parse which value for child"); + + return dexcr_prctl_onexec_test_child(which, argv[2]); + } + + FAIL_IF_MSG(true, "unknown test case"); + } + + /* + * Otherwise we are the main test invocation and run the full suite + */ + err |= test_harness(dexcr_prctl_ibrtpd_test, "dexcr_prctl_ibrtpd"); + err |= test_harness(dexcr_prctl_srapd_test, "dexcr_prctl_srapd"); + err |= test_harness(dexcr_prctl_nphie_test, "dexcr_prctl_nphie"); + + return err; +} From 9930fba02a1c587849aea1e6c5688168013c065f Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:22 +1000 Subject: [PATCH 44/85] selftests/powerpc/dexcr: Attempt to enable NPHIE in hashchk selftest Now that a process can control its DEXCR to some extent, make the hashchk tests more reliable by explicitly setting the local and onexec NPHIE aspect. 
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-7-bgray@linux.ibm.com --- tools/testing/selftests/powerpc/dexcr/hashchk_test.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c index 7d5658c9ebe4..645224bdc142 100644 --- a/tools/testing/selftests/powerpc/dexcr/hashchk_test.c +++ b/tools/testing/selftests/powerpc/dexcr/hashchk_test.c @@ -21,8 +21,14 @@ static int require_nphie(void) { SKIP_IF_MSG(!dexcr_exists(), "DEXCR not supported"); + + pr_set_dexcr(PR_PPC_DEXCR_NPHIE, PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_SET_ONEXEC); + + if (get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE) + return 0; + SKIP_IF_MSG(!(get_dexcr(EFFECTIVE) & DEXCR_PR_NPHIE), - "DEXCR[NPHIE] not enabled"); + "Failed to enable DEXCR[NPHIE]"); return 0; } From 9c4866b209ad31cae7c832d45c6137ce6a993ca0 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:23 +1000 Subject: [PATCH 45/85] selftests/powerpc/dexcr: Add DEXCR config details to lsdexcr Now that the DEXCR can be configured with prctl, add a section in lsdexcr that explains why each aspect is set the way it is. 
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-8-bgray@linux.ibm.com --- .../testing/selftests/powerpc/dexcr/lsdexcr.c | 113 +++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index 94abbfcc389e..a63db47b6610 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -1,9 +1,9 @@ // SPDX-License-Identifier: GPL-2.0+ -#include #include #include #include +#include #include "dexcr.h" #include "utils.h" @@ -16,6 +16,8 @@ struct dexcr_aspect { const char *name; const char *desc; unsigned int index; + unsigned long prctl; + const char *sysctl; }; static const struct dexcr_aspect aspects[] = { @@ -23,26 +25,36 @@ static const struct dexcr_aspect aspects[] = { .name = "SBHE", .desc = "Speculative branch hint enable", .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + .sysctl = "speculative_branch_hint_enable", }, { .name = "IBRTPD", .desc = "Indirect branch recurrent target prediction disable", .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + .sysctl = "indirect_branch_recurrent_target_prediction_disable", }, { .name = "SRAPD", .desc = "Subroutine return address prediction disable", .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + .sysctl = "subroutine_return_address_prediction_disable", }, { .name = "NPHIE", .desc = "Non-privileged hash instruction enable", .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + .sysctl = "nonprivileged_hash_instruction_enable", }, { .name = "PHIE", .desc = "Privileged hash instruction enable", .index = 6, + .prctl = -1, + .sysctl = NULL, }, }; @@ -60,7 +72,7 @@ static void print_dexcr(char *name, unsigned int bits) const char *enabled_aspects[ARRAY_SIZE(aspects) + 1] = {NULL}; size_t j = 0; - printf("%s: %08x", name, bits); + printf("%s: 0x%08x", name, bits); if (bits == 0) { printf("\n"); @@ -103,6 +115,95 @@ static 
void print_aspect(const struct dexcr_aspect *aspect) printf(" \t(%s)\n", aspect->desc); } +static void print_aspect_config(const struct dexcr_aspect *aspect) +{ + char sysctl_path[128] = "/proc/sys/kernel/dexcr/"; + const char *reason = "unknown"; + const char *reason_hyp = NULL; + const char *reason_sysctl = "no sysctl"; + const char *reason_prctl = "no prctl"; + bool actual = effective & DEXCR_PR_BIT(aspect->index); + bool expected = false; + + long sysctl_ctrl = 0; + int prctl_ctrl = 0; + int err; + + if (aspect->prctl >= 0) { + prctl_ctrl = pr_get_dexcr(aspect->prctl); + if (prctl_ctrl < 0) + reason_prctl = "(failed to read prctl)"; + else { + if (prctl_ctrl & PR_PPC_DEXCR_CTRL_SET) { + reason_prctl = "set by prctl"; + expected = true; + } else if (prctl_ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + reason_prctl = "cleared by prctl"; + expected = false; + } else + reason_prctl = "unknown prctl"; + + reason = reason_prctl; + } + } + + if (aspect->sysctl) { + strcat(sysctl_path, aspect->sysctl); + err = read_long(sysctl_path, &sysctl_ctrl, 10); + if (err) + reason_sysctl = "(failed to read sysctl)"; + else { + switch (sysctl_ctrl) { + case 0: + reason_sysctl = "cleared by sysctl"; + reason = reason_sysctl; + expected = false; + break; + case 1: + reason_sysctl = "set by sysctl"; + reason = reason_sysctl; + expected = true; + break; + case 2: + reason_sysctl = "not modified by sysctl"; + break; + case 3: + reason_sysctl = "cleared by sysctl (permanent)"; + reason = reason_sysctl; + expected = false; + break; + case 4: + reason_sysctl = "set by sysctl (permanent)"; + reason = reason_sysctl; + expected = true; + break; + default: + reason_sysctl = "unknown sysctl"; + break; + } + } + } + + + if (hdexcr & DEXCR_PR_BIT(aspect->index)) { + reason_hyp = "set by hypervisor"; + reason = reason_hyp; + expected = true; + } else + reason_hyp = "not modified by hypervisor"; + + printf("%12s (%d): %-28s (%s, %s, %s)\n", + aspect->name, + aspect->index, + reason, + reason_hyp, + 
reason_sysctl, + reason_prctl); + + if (actual != expected) + printf(" : ! actual %s does not match config\n", aspect->name); +} + int main(int argc, char *argv[]) { if (!dexcr_exists()) { @@ -114,6 +215,8 @@ int main(int argc, char *argv[]) hdexcr = get_dexcr(HDEXCR); effective = dexcr | hdexcr; + printf("current status:\n"); + print_dexcr(" DEXCR", dexcr); print_dexcr(" HDEXCR", hdexcr); print_dexcr("Effective", effective); @@ -136,6 +239,12 @@ int main(int argc, char *argv[]) else printf("ignored\n"); } + printf("\n"); + + printf("configuration:\n"); + for (size_t i = 0; i < ARRAY_SIZE(aspects); i++) + print_aspect_config(&aspects[i]); + printf("\n"); return 0; } From f88723a609787254f7645eb6ac261b8363e8a5bc Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:24 +1000 Subject: [PATCH 46/85] selftests/powerpc/dexcr: Add chdexcr utility Adds a utility to exercise the prctl DEXCR inheritance in the shell. Supports setting and clearing each aspect. Signed-off-by: Benjamin Gray [mpe: Use correct SPDX license, use execvp() for usability, print errors] Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-9-bgray@linux.ibm.com --- .../selftests/powerpc/dexcr/.gitignore | 1 + .../testing/selftests/powerpc/dexcr/Makefile | 2 +- .../testing/selftests/powerpc/dexcr/chdexcr.c | 112 ++++++++++++++++ tools/testing/selftests/powerpc/dexcr/dexcr.h | 47 +++++++ .../testing/selftests/powerpc/dexcr/lsdexcr.c | 126 ++++-------------- 5 files changed, 185 insertions(+), 103 deletions(-) create mode 100644 tools/testing/selftests/powerpc/dexcr/chdexcr.c diff --git a/tools/testing/selftests/powerpc/dexcr/.gitignore b/tools/testing/selftests/powerpc/dexcr/.gitignore index 5d526613cd26..11eefb4b9fa4 100644 --- a/tools/testing/selftests/powerpc/dexcr/.gitignore +++ b/tools/testing/selftests/powerpc/dexcr/.gitignore @@ -1,3 +1,4 @@ dexcr_test hashchk_test +chdexcr lsdexcr diff --git a/tools/testing/selftests/powerpc/dexcr/Makefile 
b/tools/testing/selftests/powerpc/dexcr/Makefile index 3b685b28f029..58cf9f722905 100644 --- a/tools/testing/selftests/powerpc/dexcr/Makefile +++ b/tools/testing/selftests/powerpc/dexcr/Makefile @@ -1,5 +1,5 @@ TEST_GEN_PROGS := dexcr_test hashchk_test -TEST_GEN_FILES := lsdexcr +TEST_GEN_FILES := lsdexcr chdexcr include ../../lib.mk include ../flags.mk diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c new file mode 100644 index 000000000000..bda44630cada --- /dev/null +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include +#include + +#include "dexcr.h" +#include "utils.h" + +static void die(const char *msg) +{ + printf("%s\n", msg); + exit(1); +} + +static void help(void) +{ + printf("Invoke a provided program with a custom DEXCR on-exec reset value\n" + "\n" + "usage: chdexcr [CHDEXCR OPTIONS] -- PROGRAM [ARGS...]\n" + "\n" + "Each configurable DEXCR aspect is exposed as an option.\n" + "\n" + "The normal option sets the aspect in the DEXCR. The --no- variant\n" + "clears that aspect. 
For example, --ibrtpd sets the IBRTPD aspect bit,\n" + "so indirect branch predicition will be disabled in the provided program.\n" + "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" + "prediction may occur.\n" + "\n" + "CHDEXCR OPTIONS:\n"); + + for (int i = 0; i < ARRAY_SIZE(aspects); i++) { + const struct dexcr_aspect *aspect = &aspects[i]; + + if (aspect->prctl == -1) + continue; + + printf(" --%-6s / --no-%-6s : %s\n", aspect->opt, aspect->opt, aspect->desc); + } +} + +static const struct dexcr_aspect *opt_to_aspect(const char *opt) +{ + for (int i = 0; i < ARRAY_SIZE(aspects); i++) + if (aspects[i].prctl != -1 && !strcmp(aspects[i].opt, opt)) + return &aspects[i]; + + return NULL; +} + +static int apply_option(const char *option) +{ + const struct dexcr_aspect *aspect; + const char *opt = NULL; + const char *set_prefix = "--"; + const char *clear_prefix = "--no-"; + unsigned long ctrl = 0; + int err; + + if (!strcmp(option, "-h") || !strcmp(option, "--help")) { + help(); + exit(0); + } + + /* Strip out --(no-) prefix and determine ctrl value */ + if (!strncmp(option, clear_prefix, strlen(clear_prefix))) { + opt = &option[strlen(clear_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC; + } else if (!strncmp(option, set_prefix, strlen(set_prefix))) { + opt = &option[strlen(set_prefix)]; + ctrl |= PR_PPC_DEXCR_CTRL_SET_ONEXEC; + } + + if (!opt || !*opt) + return 1; + + aspect = opt_to_aspect(opt); + if (!aspect) + die("unknown aspect"); + + err = pr_set_dexcr(aspect->prctl, ctrl); + if (err) + die("failed to apply option"); + + return 0; +} + +int main(int argc, char *const argv[]) +{ + int i; + + if (!dexcr_exists()) + die("DEXCR not detected on this hardware"); + + for (i = 1; i < argc; i++) + if (apply_option(argv[i])) + break; + + if (i < argc && !strcmp(argv[i], "--")) + i++; + + if (i >= argc) + die("missing command"); + + execvp(argv[i], &argv[i]); + perror("execve"); + + return errno; +} diff --git 
a/tools/testing/selftests/powerpc/dexcr/dexcr.h b/tools/testing/selftests/powerpc/dexcr/dexcr.h index a6aa7eac11da..51e9ba3b0997 100644 --- a/tools/testing/selftests/powerpc/dexcr/dexcr.h +++ b/tools/testing/selftests/powerpc/dexcr/dexcr.h @@ -9,6 +9,7 @@ #define _SELFTESTS_POWERPC_DEXCR_DEXCR_H #include +#include #include #include "reg.h" @@ -26,6 +27,52 @@ #define PPC_RAW_HASHCHK(b, i, a) \ str(.long (0x7C0005E4 | PPC_RAW_HASH_ARGS(b, i, a));) +struct dexcr_aspect { + const char *name; /* Short display name */ + const char *opt; /* Option name for chdexcr */ + const char *desc; /* Expanded aspect meaning */ + unsigned int index; /* Aspect bit index in DEXCR */ + unsigned long prctl; /* 'which' value for get/set prctl */ +}; + +static const struct dexcr_aspect aspects[] = { + { + .name = "SBHE", + .opt = "sbhe", + .desc = "Speculative branch hint enable", + .index = 0, + .prctl = PR_PPC_DEXCR_SBHE, + }, + { + .name = "IBRTPD", + .opt = "ibrtpd", + .desc = "Indirect branch recurrent target prediction disable", + .index = 3, + .prctl = PR_PPC_DEXCR_IBRTPD, + }, + { + .name = "SRAPD", + .opt = "srapd", + .desc = "Subroutine return address prediction disable", + .index = 4, + .prctl = PR_PPC_DEXCR_SRAPD, + }, + { + .name = "NPHIE", + .opt = "nphie", + .desc = "Non-privileged hash instruction enable", + .index = 5, + .prctl = PR_PPC_DEXCR_NPHIE, + }, + { + .name = "PHIE", + .opt = "phie", + .desc = "Privileged hash instruction enable", + .index = 6, + .prctl = -1, + }, +}; + bool dexcr_exists(void); bool pr_dexcr_aspect_supported(unsigned long which); diff --git a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c index a63db47b6610..7588929180ab 100644 --- a/tools/testing/selftests/powerpc/dexcr/lsdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/lsdexcr.c @@ -12,52 +12,6 @@ static unsigned int dexcr; static unsigned int hdexcr; static unsigned int effective; -struct dexcr_aspect { - const char *name; - const char 
*desc; - unsigned int index; - unsigned long prctl; - const char *sysctl; -}; - -static const struct dexcr_aspect aspects[] = { - { - .name = "SBHE", - .desc = "Speculative branch hint enable", - .index = 0, - .prctl = PR_PPC_DEXCR_SBHE, - .sysctl = "speculative_branch_hint_enable", - }, - { - .name = "IBRTPD", - .desc = "Indirect branch recurrent target prediction disable", - .index = 3, - .prctl = PR_PPC_DEXCR_IBRTPD, - .sysctl = "indirect_branch_recurrent_target_prediction_disable", - }, - { - .name = "SRAPD", - .desc = "Subroutine return address prediction disable", - .index = 4, - .prctl = PR_PPC_DEXCR_SRAPD, - .sysctl = "subroutine_return_address_prediction_disable", - }, - { - .name = "NPHIE", - .desc = "Non-privileged hash instruction enable", - .index = 5, - .prctl = PR_PPC_DEXCR_NPHIE, - .sysctl = "nonprivileged_hash_instruction_enable", - }, - { - .name = "PHIE", - .desc = "Privileged hash instruction enable", - .index = 6, - .prctl = -1, - .sysctl = NULL, - }, -}; - static void print_list(const char *list[], size_t len) { for (size_t i = 0; i < len; i++) { @@ -117,89 +71,57 @@ static void print_aspect(const struct dexcr_aspect *aspect) static void print_aspect_config(const struct dexcr_aspect *aspect) { - char sysctl_path[128] = "/proc/sys/kernel/dexcr/"; - const char *reason = "unknown"; + const char *reason = NULL; const char *reason_hyp = NULL; - const char *reason_sysctl = "no sysctl"; const char *reason_prctl = "no prctl"; bool actual = effective & DEXCR_PR_BIT(aspect->index); - bool expected = false; + bool expected = actual; /* Assume it's fine if we don't expect a specific set/clear value */ - long sysctl_ctrl = 0; - int prctl_ctrl = 0; - int err; + if (actual) + reason = "set by unknown"; + else + reason = "cleared by unknown"; - if (aspect->prctl >= 0) { - prctl_ctrl = pr_get_dexcr(aspect->prctl); - if (prctl_ctrl < 0) - reason_prctl = "(failed to read prctl)"; - else { - if (prctl_ctrl & PR_PPC_DEXCR_CTRL_SET) { + if (aspect->prctl != -1) { + 
int ctrl = pr_get_dexcr(aspect->prctl); + + if (ctrl < 0) { + reason_prctl = "failed to read prctl"; + } else { + if (ctrl & PR_PPC_DEXCR_CTRL_SET) { reason_prctl = "set by prctl"; expected = true; - } else if (prctl_ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { + } else if (ctrl & PR_PPC_DEXCR_CTRL_CLEAR) { reason_prctl = "cleared by prctl"; expected = false; - } else + } else { reason_prctl = "unknown prctl"; + } reason = reason_prctl; } } - if (aspect->sysctl) { - strcat(sysctl_path, aspect->sysctl); - err = read_long(sysctl_path, &sysctl_ctrl, 10); - if (err) - reason_sysctl = "(failed to read sysctl)"; - else { - switch (sysctl_ctrl) { - case 0: - reason_sysctl = "cleared by sysctl"; - reason = reason_sysctl; - expected = false; - break; - case 1: - reason_sysctl = "set by sysctl"; - reason = reason_sysctl; - expected = true; - break; - case 2: - reason_sysctl = "not modified by sysctl"; - break; - case 3: - reason_sysctl = "cleared by sysctl (permanent)"; - reason = reason_sysctl; - expected = false; - break; - case 4: - reason_sysctl = "set by sysctl (permanent)"; - reason = reason_sysctl; - expected = true; - break; - default: - reason_sysctl = "unknown sysctl"; - break; - } - } - } - - if (hdexcr & DEXCR_PR_BIT(aspect->index)) { reason_hyp = "set by hypervisor"; reason = reason_hyp; expected = true; - } else + } else { reason_hyp = "not modified by hypervisor"; + } - printf("%12s (%d): %-28s (%s, %s, %s)\n", + printf("%12s (%d): %-28s (%s, %s)\n", aspect->name, aspect->index, reason, reason_hyp, - reason_sysctl, reason_prctl); + /* + * The checks are not atomic, so this can technically trigger if the + * hypervisor makes a change while we are checking each source. It's + * far more likely to be a bug if we see this though. + */ if (actual != expected) printf(" : ! 
actual %s does not match config\n", aspect->name); } From 9248edf31ab28723fb00900ecb8bacdb05eeefff Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 17 Apr 2024 21:23:25 +1000 Subject: [PATCH 47/85] Documentation: Document PowerPC kernel dynamic DEXCR interface Documents how to use the PR_PPC_GET_DEXCR and PR_PPC_SET_DEXCR prctl()'s for changing a process's DEXCR or its process tree default value. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240417112325.728010-10-bgray@linux.ibm.com --- Documentation/arch/powerpc/dexcr.rst | 141 ++++++++++++++++++++++++++- 1 file changed, 139 insertions(+), 2 deletions(-) diff --git a/Documentation/arch/powerpc/dexcr.rst b/Documentation/arch/powerpc/dexcr.rst index 615a631f51fa..ab0724212fcd 100644 --- a/Documentation/arch/powerpc/dexcr.rst +++ b/Documentation/arch/powerpc/dexcr.rst @@ -36,8 +36,145 @@ state for a process. Configuration ============= -The DEXCR is currently unconfigurable. All threads are run with the -NPHIE aspect enabled. +prctl +----- + +A process can control its own userspace DEXCR value using the +``PR_PPC_GET_DEXCR`` and ``PR_PPC_SET_DEXCR`` pair of +:manpage:`prctl(2)` commands. These calls have the form:: + + prctl(PR_PPC_GET_DEXCR, unsigned long which, 0, 0, 0); + prctl(PR_PPC_SET_DEXCR, unsigned long which, unsigned long ctrl, 0, 0); + +The possible 'which' and 'ctrl' values are as follows. Note there is no relation +between the 'which' value and the DEXCR aspect's index. + +.. flat-table:: + :header-rows: 1 + :widths: 2 7 1 + + * - ``prctl()`` which + - Aspect name + - Aspect index + + * - ``PR_PPC_DEXCR_SBHE`` + - Speculative Branch Hint Enable (SBHE) + - 0 + + * - ``PR_PPC_DEXCR_IBRTPD`` + - Indirect Branch Recurrent Target Prediction Disable (IBRTPD) + - 3 + + * - ``PR_PPC_DEXCR_SRAPD`` + - Subroutine Return Address Prediction Disable (SRAPD) + - 4 + + * - ``PR_PPC_DEXCR_NPHIE`` + - Non-Privileged Hash Instruction Enable (NPHIE) + - 5 + +.. 
flat-table:: + :header-rows: 1 + :widths: 2 8 + + * - ``prctl()`` ctrl + - Meaning + + * - ``PR_PPC_DEXCR_CTRL_EDITABLE`` + - This aspect can be configured with PR_PPC_SET_DEXCR (get only) + + * - ``PR_PPC_DEXCR_CTRL_SET`` + - This aspect is set / set this aspect + + * - ``PR_PPC_DEXCR_CTRL_CLEAR`` + - This aspect is clear / clear this aspect + + * - ``PR_PPC_DEXCR_CTRL_SET_ONEXEC`` + - This aspect will be set after exec / set this aspect after exec + + * - ``PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC`` + - This aspect will be clear after exec / clear this aspect after exec + +Note that + +* which is a plain value, not a bitmask. Aspects must be worked with individually. + +* ctrl is a bitmask. ``PR_PPC_GET_DEXCR`` returns both the current and onexec + configuration. For example, ``PR_PPC_GET_DEXCR`` may return + ``PR_PPC_DEXCR_CTRL_EDITABLE | PR_PPC_DEXCR_CTRL_SET | + PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC``. This would indicate the aspect is currently + set, it will be cleared when you run exec, and you can change this with the + ``PR_PPC_SET_DEXCR`` prctl. + +* The set/clear terminology refers to setting/clearing the bit in the DEXCR. + For example:: + + prctl(PR_PPC_SET_DEXCR, PR_PPC_DEXCR_IBRTPD, PR_PPC_DEXCR_CTRL_SET, 0, 0); + + will set the IBRTPD aspect bit in the DEXCR, causing indirect branch prediction + to be disabled. + +* The status returned by ``PR_PPC_GET_DEXCR`` represents what value the process + would like applied. It does not include any alternative overrides, such as if + the hypervisor is enforcing the aspect be set. To see the true DEXCR state + software should read the appropriate SPRs directly. + +* The aspect state when starting a process is copied from the parent's state on + :manpage:`fork(2)`. The state is reset to a fixed value on + :manpage:`execve(2)`. The PR_PPC_SET_DEXCR prctl() can control both of these + values. + +* The ``*_ONEXEC`` controls do not change the current process's DEXCR. 
+ +Use ``PR_PPC_SET_DEXCR`` with one of ``PR_PPC_DEXCR_CTRL_SET`` or +``PR_PPC_DEXCR_CTRL_CLEAR`` to edit a given aspect. + +Common error codes for both getting and setting the DEXCR are as follows: + +.. flat-table:: + :header-rows: 1 + :widths: 2 8 + + * - Error + - Meaning + + * - ``EINVAL`` + - The DEXCR is not supported by the kernel. + + * - ``ENODEV`` + - The aspect is not recognised by the kernel or not supported by the + hardware. + +``PR_PPC_SET_DEXCR`` may also report the following error codes: + +.. flat-table:: + :header-rows: 1 + :widths: 2 8 + + * - Error + - Meaning + + * - ``EINVAL`` + - The ctrl value contains unrecognised flags. + + * - ``EINVAL`` + - The ctrl value contains mutually conflicting flags (e.g., + ``PR_PPC_DEXCR_CTRL_SET | PR_PPC_DEXCR_CTRL_CLEAR``) + + * - ``EPERM`` + - This aspect cannot be modified with prctl() (check for the + PR_PPC_DEXCR_CTRL_EDITABLE flag with PR_PPC_GET_DEXCR). + + * - ``EPERM`` + - The process does not have sufficient privilege to perform the operation. + For example, clearing NPHIE on exec is a privileged operation (a process + can still clear its own NPHIE aspect without privileges). + +This interface allows a process to control its own DEXCR aspects, and also set +the initial DEXCR value for any children in its process tree (up to the next +child to use an ``*_ONEXEC`` control). This allows fine-grained control over the +default value of the DEXCR, for example allowing containers to run with different +default values. coredump and ptrace From fae573060c8da4d84a2551c6753d272abfda8ddc Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 3 May 2024 12:10:12 +1000 Subject: [PATCH 48/85] Documentation: Fix the address of the linuxppc-dev mailing list This list was moved many years ago. 
Signed-off-by: Stephen Rothwell Signed-off-by: Michael Ellerman Link: https://msgid.link/20240503121012.3ba5000b@canb.auug.org.au --- Documentation/ABI/testing/sysfs-devices-system-cpu | 14 +++++++------- .../ABI/testing/sysfs-firmware-opal-powercap | 4 ++-- Documentation/ABI/testing/sysfs-firmware-opal-psr | 4 ++-- .../ABI/testing/sysfs-firmware-opal-sensor-groups | 4 ++-- .../testing/sysfs-firmware-papr-energy-scale-info | 10 +++++----- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 710d47be11e0..e7e160954e79 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -423,7 +423,7 @@ What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset Date: March 2016 Contact: Linux kernel mailing list - Linux for PowerPC mailing list + Linux for PowerPC mailing list Description: POWERNV CPUFreq driver's frequency throttle stats directory and attributes @@ -473,7 +473,7 @@ What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset Date: March 2016 Contact: Linux kernel mailing list - Linux for PowerPC mailing list + Linux for PowerPC mailing list Description: POWERNV CPUFreq driver's frequency throttle stats directory and attributes @@ -608,7 +608,7 @@ Description: Umwait control What: /sys/devices/system/cpu/svm Date: August 2019 Contact: Linux kernel mailing list - Linux for PowerPC mailing list + Linux for PowerPC mailing list Description: Secure Virtual Machine If 1, it means the system is using the Protected Execution @@ -617,7 +617,7 @@ Description: Secure Virtual Machine What: /sys/devices/system/cpu/cpuX/purr Date: Apr 2005 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: PURR ticks for this CPU since the 
system boot. The Processor Utilization Resources Register (PURR) is @@ -628,7 +628,7 @@ Description: PURR ticks for this CPU since the system boot. What: /sys/devices/system/cpu/cpuX/spurr Date: Dec 2006 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: SPURR ticks for this CPU since the system boot. The Scaled Processor Utilization Resources Register @@ -640,7 +640,7 @@ Description: SPURR ticks for this CPU since the system boot. What: /sys/devices/system/cpu/cpuX/idle_purr Date: Apr 2020 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: PURR ticks for cpuX when it was idle. This sysfs interface exposes the number of PURR ticks @@ -648,7 +648,7 @@ Description: PURR ticks for cpuX when it was idle. What: /sys/devices/system/cpu/cpuX/idle_spurr Date: Apr 2020 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: SPURR ticks for cpuX when it was idle. This sysfs interface exposes the number of SPURR ticks diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-powercap b/Documentation/ABI/testing/sysfs-firmware-opal-powercap index c9b66ec4f165..d2d12ee89288 100644 --- a/Documentation/ABI/testing/sysfs-firmware-opal-powercap +++ b/Documentation/ABI/testing/sysfs-firmware-opal-powercap @@ -1,6 +1,6 @@ What: /sys/firmware/opal/powercap Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Powercap directory for Powernv (P8, P9) servers Each folder in this directory contains a @@ -11,7 +11,7 @@ What: /sys/firmware/opal/powercap/system-powercap /sys/firmware/opal/powercap/system-powercap/powercap-max /sys/firmware/opal/powercap/system-powercap/powercap-current Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: System powercap directory and attributes applicable for Powernv (P8, P9) servers diff --git 
a/Documentation/ABI/testing/sysfs-firmware-opal-psr b/Documentation/ABI/testing/sysfs-firmware-opal-psr index cc2ece70e365..1e55b56a0f89 100644 --- a/Documentation/ABI/testing/sysfs-firmware-opal-psr +++ b/Documentation/ABI/testing/sysfs-firmware-opal-psr @@ -1,6 +1,6 @@ What: /sys/firmware/opal/psr Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Power-Shift-Ratio directory for Powernv P9 servers Power-Shift-Ratio allows to provide hints the firmware @@ -10,7 +10,7 @@ Description: Power-Shift-Ratio directory for Powernv P9 servers What: /sys/firmware/opal/psr/cpu_to_gpu_X Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: PSR sysfs attributes for Powernv P9 servers Power-Shift-Ratio between CPU and GPU for a given chip diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups index 3a2dfe542e8c..fcb1fb4795b6 100644 --- a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups +++ b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups @@ -1,6 +1,6 @@ What: /sys/firmware/opal/sensor_groups Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Sensor groups directory for POWER9 powernv servers Each folder in this directory contains a sensor group @@ -11,7 +11,7 @@ Description: Sensor groups directory for POWER9 powernv servers What: /sys/firmware/opal/sensor_groups//clear Date: August 2017 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Sysfs file to clear the min-max of all the sensors belonging to the group. 
diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info index 141a6b371469..f5cefb81ac9d 100644 --- a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info @@ -1,6 +1,6 @@ What: /sys/firmware/papr/energy_scale_info Date: February 2022 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Directory hosting a set of platform attributes like energy/frequency on Linux running as a PAPR guest. @@ -10,20 +10,20 @@ Description: Directory hosting a set of platform attributes like What: /sys/firmware/papr/energy_scale_info/ Date: February 2022 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Energy, frequency attributes directory for POWERVM servers What: /sys/firmware/papr/energy_scale_info//desc Date: February 2022 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: String description of the energy attribute of What: /sys/firmware/papr/energy_scale_info//value Date: February 2022 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: Numeric value of the energy attribute of What: /sys/firmware/papr/energy_scale_info//value_desc Date: February 2022 -Contact: Linux for PowerPC mailing list +Contact: Linux for PowerPC mailing list Description: String value of the energy attribute of From 2ecfe59cd7de1f202e9af2516a61fbbf93d0bd4d Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 2 May 2024 23:02:04 +0530 Subject: [PATCH 49/85] powerpc/64/bpf: fix tail calls for PCREL addressing With PCREL addressing, there is no kernel TOC. So, it is not setup in prologue when PCREL addressing is used. But the number of instructions to skip on a tail call was not adjusted accordingly. That resulted in not so obvious failures while using tailcalls. 
'tailcalls' selftest crashed the system with the below call trace: bpf_test_run+0xe8/0x3cc (unreliable) bpf_prog_test_run_skb+0x348/0x778 __sys_bpf+0xb04/0x2b00 sys_bpf+0x28/0x38 system_call_exception+0x168/0x340 system_call_vectored_common+0x15c/0x2ec Also, as bpf programs are always module addresses and a bpf helper in general is a core kernel text address, using PC relative addressing often fails with "out of range of pcrel address" error. Switch to using kernel base for relative addressing to handle this better. Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL addresing") Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240502173205.142794-1-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp64.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 79f23974a320..4de08e35e284 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -202,7 +202,8 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) EMIT(PPC_RAW_BLR()); } -static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, u64 func) +static int +bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) { unsigned long func_addr = func ? 
ppc_function_entry((void *)func) : 0; long reladdr; @@ -211,19 +212,20 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, u return -EINVAL; if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { - reladdr = func_addr - CTX_NIA(ctx); + reladdr = func_addr - local_paca->kernelbase; if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) { - pr_err("eBPF: address of %ps out of range of pcrel address.\n", - (void *)func); + pr_err("eBPF: address of %ps out of range of 34-bit relative address.\n", + (void *)func); return -ERANGE; } - /* pla r12,addr */ - EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | IMM_H18(reladdr)); - EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr)); - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTR()); - + EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase))); + /* Align for subsequent prefix instruction */ + if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8)) + EMIT(PPC_RAW_NOP()); + /* paddi r12,r12,addr */ + EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(0) | IMM_H18(reladdr)); + EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12) | IMM_L(reladdr)); } else { reladdr = func_addr - kernel_toc_addr(); if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { @@ -233,9 +235,9 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, u EMIT(PPC_RAW_ADDIS(_R12, _R2, PPC_HA(reladdr))); EMIT(PPC_RAW_ADDI(_R12, _R12, PPC_LO(reladdr))); - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTRL()); } + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); return 0; } @@ -285,7 +287,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o int b2p_index = bpf_to_ppc(BPF_REG_3); int bpf_tailcall_prologue_size = 8; - if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + if (!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL) && IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) bpf_tailcall_prologue_size += 4; /* skip past the toc load */ /* @@ -993,7 +995,7 @@ emit_clear: return ret; if (func_addr_fixed) - ret = 
bpf_jit_emit_func_call_hlp(image, ctx, func_addr); + ret = bpf_jit_emit_func_call_hlp(image, fimage, ctx, func_addr); else ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr); From 61688a82e047a4166436bf2665716cc070572ffa Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 2 May 2024 23:02:05 +0530 Subject: [PATCH 50/85] powerpc/bpf: enable kfunc call Currently, bpf jit code on powerpc assumes all the bpf functions and helpers to be part of core kernel text. This is false for kfunc case, as function addresses may not be part of core kernel text area. So, add support for addresses that are not within core kernel text area too, to enable kfunc support. Emit instructions based on whether the function address is within core kernel text address or not, to retain optimized instruction sequence where possible. In case of PCREL, as a bpf function that is not within core kernel text area is likely to go out of range with relative addressing on kernel base, use PC relative addressing. If that goes out of range, load the full address with PPC_LI64(). With addresses that are not within core kernel text area supported, override bpf_jit_supports_kfunc_call() to enable kfunc support. Also, override bpf_jit_supports_far_kfunc_call() to enable 64-bit pointers, as an address offset can be more than 32-bit long on PPC64. 
Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240502173205.142794-2-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 10 +++++ arch/powerpc/net/bpf_jit_comp64.c | 61 ++++++++++++++++++++++++++----- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 0f9a21783329..984655419da5 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -359,3 +359,13 @@ void bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } + +bool bpf_jit_supports_kfunc_call(void) +{ + return true; +} + +bool bpf_jit_supports_far_kfunc_call(void) +{ + return IS_ENABLED(CONFIG_PPC64); +} diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 4de08e35e284..8afc14a4a125 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -208,17 +208,13 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, unsigned long func_addr = func ? 
ppc_function_entry((void *)func) : 0; long reladdr; - if (WARN_ON_ONCE(!core_kernel_text(func_addr))) + if (WARN_ON_ONCE(!kernel_text_address(func_addr))) return -EINVAL; - if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { - reladdr = func_addr - local_paca->kernelbase; +#ifdef CONFIG_PPC_KERNEL_PCREL + reladdr = func_addr - local_paca->kernelbase; - if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) { - pr_err("eBPF: address of %ps out of range of 34-bit relative address.\n", - (void *)func); - return -ERANGE; - } + if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) { EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase))); /* Align for subsequent prefix instruction */ if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8)) @@ -227,6 +223,26 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(0) | IMM_H18(reladdr)); EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12) | IMM_L(reladdr)); } else { + unsigned long pc = (unsigned long)fimage + CTX_NIA(ctx); + bool alignment_needed = !IS_ALIGNED(pc, 8); + + reladdr = func_addr - (alignment_needed ? 
pc + 4 : pc); + + if (reladdr < (long)SZ_8G && reladdr >= -(long)SZ_8G) { + if (alignment_needed) + EMIT(PPC_RAW_NOP()); + /* pla r12,addr */ + EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | IMM_H18(reladdr)); + EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr)); + } else { + /* We can clobber r12 */ + PPC_LI64(_R12, func); + } + } + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); +#else + if (core_kernel_text(func_addr)) { reladdr = func_addr - kernel_toc_addr(); if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { pr_err("eBPF: address of %ps out of range of kernel_toc.\n", (void *)func); @@ -235,9 +251,34 @@ bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, EMIT(PPC_RAW_ADDIS(_R12, _R2, PPC_HA(reladdr))); EMIT(PPC_RAW_ADDI(_R12, _R12, PPC_LO(reladdr))); + EMIT(PPC_RAW_MTCTR(_R12)); + EMIT(PPC_RAW_BCTRL()); + } else { + if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1)) { + /* func points to the function descriptor */ + PPC_LI64(bpf_to_ppc(TMP_REG_2), func); + /* Load actual entry point from function descriptor */ + EMIT(PPC_RAW_LD(bpf_to_ppc(TMP_REG_1), bpf_to_ppc(TMP_REG_2), 0)); + /* ... and move it to CTR */ + EMIT(PPC_RAW_MTCTR(bpf_to_ppc(TMP_REG_1))); + /* + * Load TOC from function descriptor at offset 8. + * We can clobber r2 since we get called through a + * function pointer (so caller will save/restore r2). + */ + EMIT(PPC_RAW_LD(_R2, bpf_to_ppc(TMP_REG_2), 8)); + } else { + PPC_LI64(_R12, func); + EMIT(PPC_RAW_MTCTR(_R12)); + } + EMIT(PPC_RAW_BCTRL()); + /* + * Load r2 with kernel TOC as kernel TOC is used if function address falls + * within core kernel text. 
+ */ + EMIT(PPC_RAW_LD(_R2, _R13, offsetof(struct paca_struct, kernel_toc))); } - EMIT(PPC_RAW_MTCTR(_R12)); - EMIT(PPC_RAW_BCTRL()); +#endif return 0; } From 03c0f2c2b2220fc9cf8785cd7b61d3e71e24a366 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 3 May 2024 17:56:18 +1000 Subject: [PATCH 51/85] powerpc/io: Avoid clang null pointer arithmetic warnings With -Wextra clang warns about pointer arithmetic using a null pointer. When building with CONFIG_PCI=n, that triggers a warning in the IO accessors, eg: In file included from linux/arch/powerpc/include/asm/io.h:672: linux/arch/powerpc/include/asm/io-defs.h:23:1: warning: performing pointer arithmetic on a null pointer has undefined behavior [-Wnull-pointer-arithmetic] 23 | DEF_PCI_AC_RET(inb, u8, (unsigned long port), (port), pio, port) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ... linux/arch/powerpc/include/asm/io.h:591:53: note: expanded from macro '__do_inb' 591 | #define __do_inb(port) readb((PCI_IO_ADDR)_IO_BASE + port); | ~~~~~~~~~~~~~~~~~~~~~ ^ That is because when CONFIG_PCI=n, _IO_BASE is defined as 0. Although _IO_BASE is defined as plain 0, the cast (PCI_IO_ADDR) converts it to void * before the addition with port happens. Instead the addition can be done first, and then the cast. The resulting value will be the same, but avoids the warning, and also avoids void pointer arithmetic which is apparently non-standard. 
Reported-by: Naresh Kamboju Closes: https://lore.kernel.org/all/CA+G9fYtEh8zmq8k8wE-8RZwW-Qr927RLTn+KqGnq1F=ptaaNsA@mail.gmail.com Signed-off-by: Michael Ellerman Link: https://msgid.link/20240503075619.394467-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/io.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 08c550ed49be..ba2e13bb879d 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -585,12 +585,12 @@ __do_out_asm(_rec_outl, "stwbrx") #define __do_inw(port) _rec_inw(port) #define __do_inl(port) _rec_inl(port) #else /* CONFIG_PPC32 */ -#define __do_outb(val, port) writeb(val,(PCI_IO_ADDR)_IO_BASE+port); -#define __do_outw(val, port) writew(val,(PCI_IO_ADDR)_IO_BASE+port); -#define __do_outl(val, port) writel(val,(PCI_IO_ADDR)_IO_BASE+port); -#define __do_inb(port) readb((PCI_IO_ADDR)_IO_BASE + port); -#define __do_inw(port) readw((PCI_IO_ADDR)_IO_BASE + port); -#define __do_inl(port) readl((PCI_IO_ADDR)_IO_BASE + port); +#define __do_outb(val, port) writeb(val,(PCI_IO_ADDR)(_IO_BASE+port)); +#define __do_outw(val, port) writew(val,(PCI_IO_ADDR)(_IO_BASE+port)); +#define __do_outl(val, port) writel(val,(PCI_IO_ADDR)(_IO_BASE+port)); +#define __do_inb(port) readb((PCI_IO_ADDR)(_IO_BASE + port)); +#define __do_inw(port) readw((PCI_IO_ADDR)(_IO_BASE + port)); +#define __do_inl(port) readl((PCI_IO_ADDR)(_IO_BASE + port)); #endif /* !CONFIG_PPC32 */ #ifdef CONFIG_EEH @@ -606,12 +606,12 @@ __do_out_asm(_rec_outl, "stwbrx") #define __do_writesw(a, b, n) _outsw(PCI_FIX_ADDR(a),(b),(n)) #define __do_writesl(a, b, n) _outsl(PCI_FIX_ADDR(a),(b),(n)) -#define __do_insb(p, b, n) readsb((PCI_IO_ADDR)_IO_BASE+(p), (b), (n)) -#define __do_insw(p, b, n) readsw((PCI_IO_ADDR)_IO_BASE+(p), (b), (n)) -#define __do_insl(p, b, n) readsl((PCI_IO_ADDR)_IO_BASE+(p), (b), (n)) -#define __do_outsb(p, b, n) writesb((PCI_IO_ADDR)_IO_BASE+(p),(b),(n)) 
-#define __do_outsw(p, b, n) writesw((PCI_IO_ADDR)_IO_BASE+(p),(b),(n)) -#define __do_outsl(p, b, n) writesl((PCI_IO_ADDR)_IO_BASE+(p),(b),(n)) +#define __do_insb(p, b, n) readsb((PCI_IO_ADDR)(_IO_BASE+(p)), (b), (n)) +#define __do_insw(p, b, n) readsw((PCI_IO_ADDR)(_IO_BASE+(p)), (b), (n)) +#define __do_insl(p, b, n) readsl((PCI_IO_ADDR)(_IO_BASE+(p)), (b), (n)) +#define __do_outsb(p, b, n) writesb((PCI_IO_ADDR)(_IO_BASE+(p)),(b),(n)) +#define __do_outsw(p, b, n) writesw((PCI_IO_ADDR)(_IO_BASE+(p)),(b),(n)) +#define __do_outsl(p, b, n) writesl((PCI_IO_ADDR)(_IO_BASE+(p)),(b),(n)) #define __do_memset_io(addr, c, n) \ _memset_io(PCI_FIX_ADDR(addr), c, n) From be140f1732b523947425aaafbe2e37b41b622d96 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 3 May 2024 17:56:19 +1000 Subject: [PATCH 52/85] powerpc/64: Set _IO_BASE to POISON_POINTER_DELTA not 0 for CONFIG_PCI=n There is code that builds with calls to IO accessors even when CONFIG_PCI=n, but the actual calls are guarded by runtime checks. If not those calls would be faulting, because the page at virtual address zero is (usually) not mapped into the kernel. As Arnd pointed out, it is possible a large port value could cause the address to be above mmap_min_addr which would then access userspace, which would be a bug. To avoid any such issues, set _IO_BASE to POISON_POINTER_DELTA. That is a value chosen to point into unmapped space between the kernel and userspace, so any access will always fault. Note that on 32-bit POISON_POINTER_DELTA is 0, so the patch only has an effect on 64-bit. 
Signed-off-by: Michael Ellerman Link: https://msgid.link/20240503075619.394467-2-mpe@ellerman.id.au --- arch/powerpc/include/asm/io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index ba2e13bb879d..048e3705af20 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -37,7 +37,7 @@ extern struct pci_dev *isa_bridge_pcidev; * define properly based on the platform */ #ifndef CONFIG_PCI -#define _IO_BASE 0 +#define _IO_BASE POISON_POINTER_DELTA #define _ISA_MEM_BASE 0 #define PCI_DRAM_OFFSET 0 #elif defined(CONFIG_PPC32) From 8ecf3c1dab1c675721d3d0255556abe2306fa340 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 5 Mar 2024 16:36:23 +0100 Subject: [PATCH 53/85] powerpc/bpf/32: Fix failing test_bpf tests Recent additions in BPF like cpu v4 instructions, test_bpf module exhibits the following failures: test_bpf: #82 ALU_MOVSX | BPF_B jited:1 ret 2 != 1 (0x2 != 0x1)FAIL (1 times) test_bpf: #83 ALU_MOVSX | BPF_H jited:1 ret 2 != 1 (0x2 != 0x1)FAIL (1 times) test_bpf: #84 ALU64_MOVSX | BPF_B jited:1 ret 2 != 1 (0x2 != 0x1)FAIL (1 times) test_bpf: #85 ALU64_MOVSX | BPF_H jited:1 ret 2 != 1 (0x2 != 0x1)FAIL (1 times) test_bpf: #86 ALU64_MOVSX | BPF_W jited:1 ret 2 != 1 (0x2 != 0x1)FAIL (1 times) test_bpf: #165 ALU_SDIV_X: -6 / 2 = -3 jited:1 ret 2147483645 != -3 (0x7ffffffd != 0xfffffffd)FAIL (1 times) test_bpf: #166 ALU_SDIV_K: -6 / 2 = -3 jited:1 ret 2147483645 != -3 (0x7ffffffd != 0xfffffffd)FAIL (1 times) test_bpf: #169 ALU_SMOD_X: -7 % 2 = -1 jited:1 ret 1 != -1 (0x1 != 0xffffffff)FAIL (1 times) test_bpf: #170 ALU_SMOD_K: -7 % 2 = -1 jited:1 ret 1 != -1 (0x1 != 0xffffffff)FAIL (1 times) test_bpf: #172 ALU64_SMOD_K: -7 % 2 = -1 jited:1 ret 1 != -1 (0x1 != 0xffffffff)FAIL (1 times) test_bpf: #313 BSWAP 16: 0x0123456789abcdef -> 0xefcd eBPF filter opcode 00d7 (@2) unsupported jited:0 301 PASS test_bpf: #314 BSWAP 32: 0x0123456789abcdef -> 0xefcdab89 eBPF 
filter opcode 00d7 (@2) unsupported jited:0 555 PASS test_bpf: #315 BSWAP 64: 0x0123456789abcdef -> 0x67452301 eBPF filter opcode 00d7 (@2) unsupported jited:0 268 PASS test_bpf: #316 BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89 eBPF filter opcode 00d7 (@2) unsupported jited:0 269 PASS test_bpf: #317 BSWAP 16: 0xfedcba9876543210 -> 0x1032 eBPF filter opcode 00d7 (@2) unsupported jited:0 460 PASS test_bpf: #318 BSWAP 32: 0xfedcba9876543210 -> 0x10325476 eBPF filter opcode 00d7 (@2) unsupported jited:0 320 PASS test_bpf: #319 BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe eBPF filter opcode 00d7 (@2) unsupported jited:0 222 PASS test_bpf: #320 BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476 eBPF filter opcode 00d7 (@2) unsupported jited:0 273 PASS test_bpf: #344 BPF_LDX_MEMSX | BPF_B eBPF filter opcode 0091 (@5) unsupported jited:0 432 PASS test_bpf: #345 BPF_LDX_MEMSX | BPF_H eBPF filter opcode 0089 (@5) unsupported jited:0 381 PASS test_bpf: #346 BPF_LDX_MEMSX | BPF_W eBPF filter opcode 0081 (@5) unsupported jited:0 505 PASS test_bpf: #490 JMP32_JA: Unconditional jump: if (true) return 1 eBPF filter opcode 0006 (@1) unsupported jited:0 261 PASS test_bpf: Summary: 1040 PASSED, 10 FAILED, [924/1038 JIT'ed] Fix them by adding missing processing. 
Fixes: daabb2b098e0 ("bpf/tests: add tests for cpuv4 instructions") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/91de862dda99d170697eb79ffb478678af7e0b27.1709652689.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc-opcode.h | 4 + arch/powerpc/net/bpf_jit_comp32.c | 137 ++++++++++++++++++++------ 2 files changed, 110 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 005601243dda..076ae60b4a55 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -510,6 +510,7 @@ #define PPC_RAW_STB(r, base, i) (0x98000000 | ___PPC_RS(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LBZ(r, base, i) (0x88000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LDX(r, base, b) (0x7c00002a | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_LHA(r, base, i) (0xa8000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LHZ(r, base, i) (0xa0000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) #define PPC_RAW_LHBRX(r, base, b) (0x7c00062c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_RAW_LWBRX(r, base, b) (0x7c00042c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) @@ -532,6 +533,7 @@ #define PPC_RAW_MULW(d, a, b) (0x7c0001d6 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_MULHWU(d, a, b) (0x7c000016 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_MULI(d, a, i) (0x1c000000 | ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_DIVW(d, a, b) (0x7c0003d6 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_DIVWU(d, a, b) (0x7c000396 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_DIVDU(d, a, b) (0x7c000392 | ___PPC_RT(d) | ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_RAW_DIVDE(t, a, b) (0x7c000352 | ___PPC_RT(t) | ___PPC_RA(a) | ___PPC_RB(b)) @@ -550,6 +552,8 @@ #define PPC_RAW_XOR(d, a, b) (0x7c000278 | 
___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(b)) #define PPC_RAW_XORI(d, a, i) (0x68000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) #define PPC_RAW_XORIS(d, a, i) (0x6c000000 | ___PPC_RA(d) | ___PPC_RS(a) | IMM_L(i)) +#define PPC_RAW_EXTSB(d, a) (0x7c000774 | ___PPC_RA(d) | ___PPC_RS(a)) +#define PPC_RAW_EXTSH(d, a) (0x7c000734 | ___PPC_RA(d) | ___PPC_RS(a)) #define PPC_RAW_EXTSW(d, a) (0x7c0007b4 | ___PPC_RA(d) | ___PPC_RS(a)) #define PPC_RAW_SLW(d, a, s) (0x7c000030 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(s)) #define PPC_RAW_SLD(d, a, s) (0x7c000036 | ___PPC_RA(d) | ___PPC_RS(a) | ___PPC_RB(s)) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 2f39c50ca729..43b97032a91c 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -450,10 +450,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code } break; case BPF_ALU | BPF_DIV | BPF_X: /* (u32) dst /= (u32) src */ - EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, src_reg)); + if (off) + EMIT(PPC_RAW_DIVW(dst_reg, src2_reg, src_reg)); + else + EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, src_reg)); break; case BPF_ALU | BPF_MOD | BPF_X: /* (u32) dst %= (u32) src */ - EMIT(PPC_RAW_DIVWU(_R0, src2_reg, src_reg)); + if (off) + EMIT(PPC_RAW_DIVW(_R0, src2_reg, src_reg)); + else + EMIT(PPC_RAW_DIVWU(_R0, src2_reg, src_reg)); EMIT(PPC_RAW_MULW(_R0, src_reg, _R0)); EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0)); break; @@ -467,10 +473,16 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code if (imm == 1) { EMIT(PPC_RAW_MR(dst_reg, src2_reg)); } else if (is_power_of_2((u32)imm)) { - EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, ilog2(imm))); + if (off) + EMIT(PPC_RAW_SRAWI(dst_reg, src2_reg, ilog2(imm))); + else + EMIT(PPC_RAW_SRWI(dst_reg, src2_reg, ilog2(imm))); } else { PPC_LI32(_R0, imm); - EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, _R0)); + if (off) + EMIT(PPC_RAW_DIVW(dst_reg, src2_reg, _R0)); + else + 
EMIT(PPC_RAW_DIVWU(dst_reg, src2_reg, _R0)); } break; case BPF_ALU | BPF_MOD | BPF_K: /* (u32) dst %= (u32) imm */ @@ -480,11 +492,19 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code if (!is_power_of_2((u32)imm)) { bpf_set_seen_register(ctx, tmp_reg); PPC_LI32(tmp_reg, imm); - EMIT(PPC_RAW_DIVWU(_R0, src2_reg, tmp_reg)); + if (off) + EMIT(PPC_RAW_DIVW(_R0, src2_reg, tmp_reg)); + else + EMIT(PPC_RAW_DIVWU(_R0, src2_reg, tmp_reg)); EMIT(PPC_RAW_MULW(_R0, tmp_reg, _R0)); EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0)); } else if (imm == 1) { EMIT(PPC_RAW_LI(dst_reg, 0)); + } else if (off) { + EMIT(PPC_RAW_SRAWI(_R0, src2_reg, ilog2(imm))); + EMIT(PPC_RAW_ADDZE(_R0, _R0)); + EMIT(PPC_RAW_SLWI(_R0, _R0, ilog2(imm))); + EMIT(PPC_RAW_SUB(dst_reg, src2_reg, _R0)); } else { imm = ilog2((u32)imm); EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - imm, 31)); @@ -497,11 +517,21 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code imm = -imm; if (!is_power_of_2(imm)) return -EOPNOTSUPP; - if (imm == 1) + if (imm == 1) { EMIT(PPC_RAW_LI(dst_reg, 0)); - else + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } else if (off) { + EMIT(PPC_RAW_SRAWI(dst_reg_h, src2_reg_h, 31)); + EMIT(PPC_RAW_XOR(dst_reg, src2_reg, dst_reg_h)); + EMIT(PPC_RAW_SUBFC(dst_reg, dst_reg_h, dst_reg)); + EMIT(PPC_RAW_RLWINM(dst_reg, dst_reg, 0, 32 - ilog2(imm), 31)); + EMIT(PPC_RAW_XOR(dst_reg, dst_reg, dst_reg_h)); + EMIT(PPC_RAW_SUBFC(dst_reg, dst_reg_h, dst_reg)); + EMIT(PPC_RAW_SUBFE(dst_reg_h, dst_reg_h, dst_reg_h)); + } else { EMIT(PPC_RAW_RLWINM(dst_reg, src2_reg, 0, 32 - ilog2(imm), 31)); - EMIT(PPC_RAW_LI(dst_reg_h, 0)); + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } break; case BPF_ALU64 | BPF_DIV | BPF_K: /* dst /= imm */ if (!imm) @@ -727,15 +757,30 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * MOV */ case BPF_ALU64 | BPF_MOV | BPF_X: /* dst = src */ - if (dst_reg == src_reg) - break; - EMIT(PPC_RAW_MR(dst_reg, src_reg)); 
- EMIT(PPC_RAW_MR(dst_reg_h, src_reg_h)); + if (off == 8) { + EMIT(PPC_RAW_EXTSB(dst_reg, src_reg)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg, 31)); + } else if (off == 16) { + EMIT(PPC_RAW_EXTSH(dst_reg, src_reg)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg, 31)); + } else if (off == 32 && dst_reg == src_reg) { + EMIT(PPC_RAW_SRAWI(dst_reg_h, src_reg, 31)); + } else if (off == 32) { + EMIT(PPC_RAW_MR(dst_reg, src_reg)); + EMIT(PPC_RAW_SRAWI(dst_reg_h, src_reg, 31)); + } else if (dst_reg != src_reg) { + EMIT(PPC_RAW_MR(dst_reg, src_reg)); + EMIT(PPC_RAW_MR(dst_reg_h, src_reg_h)); + } break; case BPF_ALU | BPF_MOV | BPF_X: /* (u32) dst = src */ /* special mov32 for zext */ if (imm == 1) EMIT(PPC_RAW_LI(dst_reg_h, 0)); + else if (off == 8) + EMIT(PPC_RAW_EXTSB(dst_reg, src_reg)); + else if (off == 16) + EMIT(PPC_RAW_EXTSH(dst_reg, src_reg)); else if (dst_reg != src_reg) EMIT(PPC_RAW_MR(dst_reg, src_reg)); break; @@ -751,6 +796,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * BPF_FROM_BE/LE */ case BPF_ALU | BPF_END | BPF_FROM_LE: + case BPF_ALU64 | BPF_END | BPF_FROM_LE: switch (imm) { case 16: /* Copy 16 bits to upper part */ @@ -785,6 +831,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code EMIT(PPC_RAW_MR(dst_reg_h, tmp_reg)); break; } + if (BPF_CLASS(code) == BPF_ALU64 && imm != 64) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); break; case BPF_ALU | BPF_END | BPF_FROM_BE: switch (imm) { @@ -918,11 +966,17 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * BPF_LDX */ case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */ + case BPF_LDX | BPF_MEMSX | BPF_B: case BPF_LDX | BPF_PROBE_MEM | BPF_B: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_B: case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */ + case BPF_LDX | BPF_MEMSX | BPF_H: case BPF_LDX | BPF_PROBE_MEM | BPF_H: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_H: case BPF_LDX | BPF_MEM | BPF_W: /* dst = 
*(u32 *)(ul) (src + off) */ + case BPF_LDX | BPF_MEMSX | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_W: + case BPF_LDX | BPF_PROBE_MEMSX | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */ case BPF_LDX | BPF_PROBE_MEM | BPF_DW: /* @@ -931,7 +985,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * load only if addr is kernel address (see is_kernel_addr()), otherwise * set dst_reg=0 and move on. */ - if (BPF_MODE(code) == BPF_PROBE_MEM) { + if (BPF_MODE(code) == BPF_PROBE_MEM || BPF_MODE(code) == BPF_PROBE_MEMSX) { PPC_LI32(_R0, TASK_SIZE - off); EMIT(PPC_RAW_CMPLW(src_reg, _R0)); PPC_BCC_SHORT(COND_GT, (ctx->idx + 4) * 4); @@ -953,30 +1007,48 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code * as there are two load instructions for dst_reg_h & dst_reg * respectively. */ - if (size == BPF_DW) + if (size == BPF_DW || + (size == BPF_B && BPF_MODE(code) == BPF_PROBE_MEMSX)) PPC_JMP((ctx->idx + 3) * 4); else PPC_JMP((ctx->idx + 2) * 4); } - switch (size) { - case BPF_B: - EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); - break; - case BPF_H: - EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); - break; - case BPF_W: - EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); - break; - case BPF_DW: - EMIT(PPC_RAW_LWZ(dst_reg_h, src_reg, off)); - EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off + 4)); - break; - } + if (BPF_MODE(code) == BPF_MEMSX || BPF_MODE(code) == BPF_PROBE_MEMSX) { + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + EMIT(PPC_RAW_EXTSB(dst_reg, dst_reg)); + break; + case BPF_H: + EMIT(PPC_RAW_LHA(dst_reg, src_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); + break; + } + if (!fp->aux->verifier_zext) + EMIT(PPC_RAW_SRAWI(dst_reg_h, dst_reg, 31)); - if (size != BPF_DW && !fp->aux->verifier_zext) - EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } else { + switch (size) { + case BPF_B: + EMIT(PPC_RAW_LBZ(dst_reg, src_reg, off)); + break; + case BPF_H: + 
EMIT(PPC_RAW_LHZ(dst_reg, src_reg, off)); + break; + case BPF_W: + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off)); + break; + case BPF_DW: + EMIT(PPC_RAW_LWZ(dst_reg_h, src_reg, off)); + EMIT(PPC_RAW_LWZ(dst_reg, src_reg, off + 4)); + break; + } + if (size != BPF_DW && !fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(dst_reg_h, 0)); + } if (BPF_MODE(code) == BPF_PROBE_MEM) { int insn_idx = ctx->idx - 1; @@ -1068,6 +1140,9 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct code case BPF_JMP | BPF_JA: PPC_JMP(addrs[i + 1 + off]); break; + case BPF_JMP32 | BPF_JA: + PPC_JMP(addrs[i + 1 + imm]); + break; case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: From 4f1dad618587fa2fa903235301111c8c382b6f3e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 16 Feb 2024 22:55:17 +0900 Subject: [PATCH 54/85] powerpc: remove unused *_syscall_64.o variables in Makefile Commit ab1a517d55b0 ("powerpc/syscall: Rename syscall_64.c into interrupt.c") failed to update these three lines: GCOV_PROFILE_syscall_64.o := n KCOV_INSTRUMENT_syscall_64.o := n UBSAN_SANITIZE_syscall_64.o := n To restore the original behavior, we could replace them with: GCOV_PROFILE_interrupt.o := n KCOV_INSTRUMENT_interrupt.o := n UBSAN_SANITIZE_interrupt.o := n However, nobody has noticed the functional change in the past three years, so they were unneeded. 
Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://msgid.link/20240216135517.2002749-1-masahiroy@kernel.org --- arch/powerpc/kernel/Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1d183b077948..f5dd2d65cdbe 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -191,9 +191,6 @@ GCOV_PROFILE_kprobes-ftrace.o := n KCOV_INSTRUMENT_kprobes-ftrace.o := n KCSAN_SANITIZE_kprobes-ftrace.o := n UBSAN_SANITIZE_kprobes-ftrace.o := n -GCOV_PROFILE_syscall_64.o := n -KCOV_INSTRUMENT_syscall_64.o := n -UBSAN_SANITIZE_syscall_64.o := n UBSAN_SANITIZE_vdso.o := n # Necessary for booting with kcov enabled on book3e machines From 6efc2f1a64ef62f1e3893da90d6ac618988992c2 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 8 Apr 2024 16:39:16 +0800 Subject: [PATCH 55/85] powerpc: boot: Fix kernel-doc param for partial_decompress Fix the kernel-doc annotation for the 'skip' parameter in the partial_decompress() function by adding a missing underscore and colon. Signed-off-by: Yang Li Reviewed-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://msgid.link/20240408083916.123369-1-yang.lee@linux.alibaba.com --- arch/powerpc/boot/decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c index 977eb15a6d17..6835cb53f034 100644 --- a/arch/powerpc/boot/decompress.c +++ b/arch/powerpc/boot/decompress.c @@ -101,7 +101,7 @@ static void print_err(char *s) * @input_size: length of the input buffer * @outbuf: output buffer * @output_size: length of the output buffer - * @skip number of output bytes to ignore + * @_skip: number of output bytes to ignore * * This function takes compressed data from inbuf, decompresses and write it to * outbuf. 
Once output_size bytes are written to the output buffer, or the From 97bd2693b399cfd436acaa230d8f09e4c39e8e5c Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 8 Apr 2024 13:31:08 +0800 Subject: [PATCH 56/85] powerpc: Fix kernel-doc comments in fsl_gtm.c Fix some function names in kernel-doc comments. Signed-off-by: Yang Li Reviewed-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://msgid.link/20240408053109.96360-1-yang.lee@linux.alibaba.com --- arch/powerpc/sysdev/fsl_gtm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/sysdev/fsl_gtm.c b/arch/powerpc/sysdev/fsl_gtm.c index 39186ad6b3c3..3dabc9621810 100644 --- a/arch/powerpc/sysdev/fsl_gtm.c +++ b/arch/powerpc/sysdev/fsl_gtm.c @@ -77,7 +77,7 @@ struct gtm { static LIST_HEAD(gtms); /** - * gtm_get_timer - request GTM timer to use it with the rest of GTM API + * gtm_get_timer16 - request GTM timer to use it with the rest of GTM API * Context: non-IRQ * * This function reserves GTM timer for later use. It returns gtm_timer @@ -110,7 +110,7 @@ struct gtm_timer *gtm_get_timer16(void) EXPORT_SYMBOL(gtm_get_timer16); /** - * gtm_get_specific_timer - request specific GTM timer + * gtm_get_specific_timer16 - request specific GTM timer * @gtm: specific GTM, pass here GTM's device_node->data * @timer: specific timer number, Timer1 is 0. 
* Context: non-IRQ @@ -260,7 +260,7 @@ int gtm_set_timer16(struct gtm_timer *tmr, unsigned long usec, bool reload) EXPORT_SYMBOL(gtm_set_timer16); /** - * gtm_set_exact_utimer16 - (re)set 16 bits timer + * gtm_set_exact_timer16 - (re)set 16 bits timer * @tmr: pointer to the gtm_timer structure obtained from gtm_get_timer * @usec: timer interval in microseconds * @reload: if set, the timer will reset upon expiry rather than From 554da5e0f71238384787954242d881cfeeff844d Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 8 Apr 2024 13:31:09 +0800 Subject: [PATCH 57/85] powerpc/rtas: Add kernel-doc comments to smp_startup_cpu() This commit adds kernel-doc style comments with complete parameter descriptions for the function smp_startup_cpu(). Signed-off-by: Yang Li Acked-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://msgid.link/20240408053109.96360-2-yang.lee@linux.alibaba.com --- arch/powerpc/platforms/cell/smp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index 30394c6f8894..fee638fd8970 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -54,6 +54,7 @@ static cpumask_t of_spin_map; /** * smp_startup_cpu() - start the given cpu + * @lcpu: Logical CPU ID of the CPU to be started. * * At boot time, there is nothing to do for primary threads which were * started from Open Firmware. For anything else, call RTAS with the From b12ba096b89084d1e2d6ebdb71b852eeebef95d3 Mon Sep 17 00:00:00 2001 From: Ran Wang Date: Fri, 19 Jan 2024 15:38:54 -0500 Subject: [PATCH 58/85] powerpc: dts: add power management nodes to FSL chips Enable Power Management feature on device tree, including MPC8536, MPC8544, MPC8548, MPC8572, P1010, P1020, P1021, P1022, P2020, P2041, P3041, T104X, T1024. 
Signed-off-by: Zhao Chenhui Signed-off-by: Ran Wang Signed-off-by: Frank Li Signed-off-by: Michael Ellerman Link: https://msgid.link/20240119203911.3143928-1-Frank.Li@nxp.com --- arch/powerpc/boot/dts/fsl/mpc8536si-post.dtsi | 14 ++++++++++++-- arch/powerpc/boot/dts/fsl/mpc8544si-post.dtsi | 2 ++ arch/powerpc/boot/dts/fsl/mpc8548si-post.dtsi | 2 ++ arch/powerpc/boot/dts/fsl/mpc8572si-post.dtsi | 2 ++ arch/powerpc/boot/dts/fsl/p1010si-post.dtsi | 14 ++++++++++++++ arch/powerpc/boot/dts/fsl/p1020si-post.dtsi | 5 +++++ arch/powerpc/boot/dts/fsl/p1021si-post.dtsi | 5 +++++ arch/powerpc/boot/dts/fsl/p1022si-post.dtsi | 7 +++++-- arch/powerpc/boot/dts/fsl/p2020si-post.dtsi | 17 +++++++++++++---- arch/powerpc/boot/dts/fsl/pq3-power.dtsi | 19 +++++++++++++++++++ arch/powerpc/boot/dts/fsl/t1024rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/t1040rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/t1042rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts | 2 +- 14 files changed, 83 insertions(+), 12 deletions(-) create mode 100644 arch/powerpc/boot/dts/fsl/pq3-power.dtsi diff --git a/arch/powerpc/boot/dts/fsl/mpc8536si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8536si-post.dtsi index 41935709ebe8..fba40a1bccc0 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8536si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8536si-post.dtsi @@ -199,6 +199,10 @@ /include/ "pq3-dma-0.dtsi" /include/ "pq3-etsec1-0.dtsi" + enet0: ethernet@24000 { + fsl,wake-on-filer; + fsl,pmc-handle = <&etsec1_clk>; + }; /include/ "pq3-etsec1-timer-0.dtsi" usb@22000 { @@ -222,9 +226,10 @@ }; /include/ "pq3-etsec1-2.dtsi" - - ethernet@26000 { + enet2: ethernet@26000 { cell-index = <1>; + fsl,wake-on-filer; + fsl,pmc-handle = <&etsec3_clk>; }; usb@2b000 { @@ -249,4 +254,9 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" + power@e0070 { + compatible = "fsl,mpc8536-pmc", "fsl,mpc8548-pmc"; + }; }; diff --git a/arch/powerpc/boot/dts/fsl/mpc8544si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8544si-post.dtsi index 
b68eb119faef..ea7416af7ee3 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8544si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8544si-post.dtsi @@ -188,4 +188,6 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; diff --git a/arch/powerpc/boot/dts/fsl/mpc8548si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8548si-post.dtsi index 579d76cb8e32..dddb7374508d 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8548si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8548si-post.dtsi @@ -156,4 +156,6 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; diff --git a/arch/powerpc/boot/dts/fsl/mpc8572si-post.dtsi b/arch/powerpc/boot/dts/fsl/mpc8572si-post.dtsi index 49294cf36b4e..40a6cff77032 100644 --- a/arch/powerpc/boot/dts/fsl/mpc8572si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/mpc8572si-post.dtsi @@ -193,4 +193,6 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; diff --git a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi index ccda0a91abf0..b540e58ff79e 100644 --- a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi @@ -183,9 +183,23 @@ /include/ "pq3-etsec2-1.dtsi" /include/ "pq3-etsec2-2.dtsi" + enet0: ethernet@b0000 { + fsl,pmc-handle = <&etsec1_clk>; + }; + + enet1: ethernet@b1000 { + fsl,pmc-handle = <&etsec2_clk>; + }; + + enet2: ethernet@b2000 { + fsl,pmc-handle = <&etsec3_clk>; + }; + global-utilities@e0000 { compatible = "fsl,p1010-guts"; reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; diff --git a/arch/powerpc/boot/dts/fsl/p1020si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1020si-post.dtsi index 642dc3a83d0e..cc4c7461003b 100644 --- a/arch/powerpc/boot/dts/fsl/p1020si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1020si-post.dtsi @@ -163,14 +163,17 @@ /include/ "pq3-etsec2-0.dtsi" enet0: enet0_grp2: ethernet@b0000 { + fsl,pmc-handle = <&etsec1_clk>; }; /include/ "pq3-etsec2-1.dtsi" enet1: enet1_grp2: 
ethernet@b1000 { + fsl,pmc-handle = <&etsec2_clk>; }; /include/ "pq3-etsec2-2.dtsi" enet2: enet2_grp2: ethernet@b2000 { + fsl,pmc-handle = <&etsec3_clk>; }; global-utilities@e0000 { @@ -178,6 +181,8 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; /include/ "pq3-etsec2-grp2-0.dtsi" diff --git a/arch/powerpc/boot/dts/fsl/p1021si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1021si-post.dtsi index 407cb5fd0f5b..378195db9fca 100644 --- a/arch/powerpc/boot/dts/fsl/p1021si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1021si-post.dtsi @@ -159,14 +159,17 @@ /include/ "pq3-etsec2-0.dtsi" enet0: enet0_grp2: ethernet@b0000 { + fsl,pmc-handle = <&etsec1_clk>; }; /include/ "pq3-etsec2-1.dtsi" enet1: enet1_grp2: ethernet@b1000 { + fsl,pmc-handle = <&etsec2_clk>; }; /include/ "pq3-etsec2-2.dtsi" enet2: enet2_grp2: ethernet@b2000 { + fsl,pmc-handle = <&etsec3_clk>; }; global-utilities@e0000 { @@ -174,6 +177,8 @@ reg = <0xe0000 0x1000>; fsl,has-rstcr; }; + +/include/ "pq3-power.dtsi" }; &qe { diff --git a/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi index 093e4e3ed368..6ac21e81344a 100644 --- a/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1022si-post.dtsi @@ -225,11 +225,13 @@ /include/ "pq3-etsec2-0.dtsi" enet0: enet0_grp2: ethernet@b0000 { fsl,wake-on-filer; + fsl,pmc-handle = <&etsec1_clk>; }; /include/ "pq3-etsec2-1.dtsi" enet1: enet1_grp2: ethernet@b1000 { fsl,wake-on-filer; + fsl,pmc-handle = <&etsec2_clk>; }; global-utilities@e0000 { @@ -238,9 +240,10 @@ fsl,has-rstcr; }; +/include/ "pq3-power.dtsi" power@e0070 { - compatible = "fsl,mpc8536-pmc", "fsl,mpc8548-pmc"; - reg = <0xe0070 0x20>; + compatible = "fsl,p1022-pmc", "fsl,mpc8536-pmc", + "fsl,mpc8548-pmc"; }; }; diff --git a/arch/powerpc/boot/dts/fsl/p2020si-post.dtsi b/arch/powerpc/boot/dts/fsl/p2020si-post.dtsi index 81b9ab2119be..d410082d21c0 100644 --- a/arch/powerpc/boot/dts/fsl/p2020si-post.dtsi +++ 
b/arch/powerpc/boot/dts/fsl/p2020si-post.dtsi @@ -178,6 +178,10 @@ compatible = "fsl-usb2-dr-v1.6", "fsl-usb2-dr"; }; /include/ "pq3-etsec1-0.dtsi" + enet0: ethernet@24000 { + fsl,pmc-handle = <&etsec1_clk>; + + }; /include/ "pq3-etsec1-timer-0.dtsi" ptp_clock@24e00 { @@ -186,7 +190,15 @@ /include/ "pq3-etsec1-1.dtsi" + enet1: ethernet@25000 { + fsl,pmc-handle = <&etsec2_clk>; + }; + /include/ "pq3-etsec1-2.dtsi" + enet2: ethernet@26000 { + fsl,pmc-handle = <&etsec3_clk>; + }; + /include/ "pq3-esdhc-0.dtsi" sdhc@2e000 { compatible = "fsl,p2020-esdhc", "fsl,esdhc"; @@ -202,8 +214,5 @@ fsl,has-rstcr; }; - pmc: power@e0070 { - compatible = "fsl,mpc8548-pmc"; - reg = <0xe0070 0x20>; - }; +/include/ "pq3-power.dtsi" }; diff --git a/arch/powerpc/boot/dts/fsl/pq3-power.dtsi b/arch/powerpc/boot/dts/fsl/pq3-power.dtsi new file mode 100644 index 000000000000..6af12401004d --- /dev/null +++ b/arch/powerpc/boot/dts/fsl/pq3-power.dtsi @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: (GPL-2.0+) +/* + * Copyright 2024 NXP + */ + +power@e0070 { + compatible = "fsl,mpc8548-pmc"; + reg = <0xe0070 0x20>; + + etsec1_clk: soc-clk@24 { + fsl,pmcdr-mask = <0x00000080>; + }; + etsec2_clk: soc-clk@25 { + fsl,pmcdr-mask = <0x00000040>; + }; + etsec3_clk: soc-clk@26 { + fsl,pmcdr-mask = <0x00000020>; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts b/arch/powerpc/boot/dts/fsl/t1024rdb.dts index 270aaf631f2a..7d003e07a9fb 100644 --- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts @@ -91,7 +91,7 @@ board-control@2,0 { #address-cells = <1>; #size-cells = <1>; - compatible = "fsl,t1024-cpld"; + compatible = "fsl,t1024-cpld", "fsl,deepsleep-cpld"; reg = <3 0 0x300>; ranges = <0 3 0 0x300>; bank-width = <1>; diff --git a/arch/powerpc/boot/dts/fsl/t1040rdb.dts b/arch/powerpc/boot/dts/fsl/t1040rdb.dts index dd3aab81e9de..4347924e9aa7 100644 --- a/arch/powerpc/boot/dts/fsl/t1040rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1040rdb.dts @@ -104,7 +104,7 @@ ifc: 
localbus@ffe124000 { cpld@3,0 { - compatible = "fsl,t1040rdb-cpld"; + compatible = "fsl,t104xrdb-cpld", "fsl,deepsleep-cpld"; }; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t1042rdb.dts b/arch/powerpc/boot/dts/fsl/t1042rdb.dts index 3ebb712224cb..099764322b33 100644 --- a/arch/powerpc/boot/dts/fsl/t1042rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1042rdb.dts @@ -68,7 +68,7 @@ ifc: localbus@ffe124000 { cpld@3,0 { - compatible = "fsl,t1042rdb-cpld"; + compatible = "fsl,t104xrdb-cpld", "fsl,deepsleep-cpld"; }; }; }; diff --git a/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts b/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts index 8ec3ff45e6fc..b10cab1a347b 100644 --- a/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts +++ b/arch/powerpc/boot/dts/fsl/t1042rdb_pi.dts @@ -41,7 +41,7 @@ ifc: localbus@ffe124000 { cpld@3,0 { - compatible = "fsl,t1042rdb_pi-cpld"; + compatible = "fsl,t104xrdb-cpld", "fsl,deepsleep-cpld"; }; }; From 9c8dc6f34351cd0c6a2ef83be2266f7dd67c152c Mon Sep 17 00:00:00 2001 From: Xiaowei Bao Date: Fri, 19 Jan 2024 15:38:55 -0500 Subject: [PATCH 59/85] powerpc: dts: p1010rdb: fix INTx interrupt issue on P1010RDB-PB Because INTA is shared with the active-low PHY2 interrupt on the P1010RDB-PA board, configure P1010RDB-PA's INTA polarity as active-low. On the P1010RDB-PB board INTA is used separately, so configure P1010RDB-PB's INTA polarity as active-high. INTx on P1010RDB-PB does not work because the pcie@0 node fixup is overwritten by the p1010si-post.dtsi file, so move the pcie@0 node fixup to p1010rdb-pb.dts and p1010rdb-pb_36b.dts. 
Signed-off-by: Xiaowei Bao Signed-off-by: Li Yang Signed-off-by: Frank Li Signed-off-by: Michael Ellerman Link: https://msgid.link/20240119203911.3143928-2-Frank.Li@nxp.com --- arch/powerpc/boot/dts/fsl/p1010rdb-pb.dts | 16 ++++++++++++++++ arch/powerpc/boot/dts/fsl/p1010rdb-pb_36b.dts | 16 ++++++++++++++++ arch/powerpc/boot/dts/fsl/p1010rdb.dtsi | 16 ---------------- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb-pb.dts b/arch/powerpc/boot/dts/fsl/p1010rdb-pb.dts index 3a94acbb3c03..ce3346d77858 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb-pb.dts +++ b/arch/powerpc/boot/dts/fsl/p1010rdb-pb.dts @@ -29,3 +29,19 @@ }; /include/ "p1010si-post.dtsi" + +&pci0 { + pcie@0 { + interrupt-map = < + /* IDSEL 0x0 */ + /* + *irq[4:5] are active-high + *irq[6:7] are active-low + */ + 0000 0x0 0x0 0x1 &mpic 0x4 0x2 0x0 0x0 + 0000 0x0 0x0 0x2 &mpic 0x5 0x2 0x0 0x0 + 0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 + 0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 + >; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb-pb_36b.dts b/arch/powerpc/boot/dts/fsl/p1010rdb-pb_36b.dts index 4cf255fedc96..83590354f9a0 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb-pb_36b.dts +++ b/arch/powerpc/boot/dts/fsl/p1010rdb-pb_36b.dts @@ -56,3 +56,19 @@ }; /include/ "p1010si-post.dtsi" + +&pci0 { + pcie@0 { + interrupt-map = < + /* IDSEL 0x0 */ + /* + *irq[4:5] are active-high + *irq[6:7] are active-low + */ + 0000 0x0 0x0 0x1 &mpic 0x4 0x2 0x0 0x0 + 0000 0x0 0x0 0x2 &mpic 0x5 0x2 0x0 0x0 + 0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 + 0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 + >; + }; +}; diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi index 2ca9cee2ddeb..ef49a7d6c69d 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010rdb.dtsi @@ -215,19 +215,3 @@ phy-connection-type = "sgmii"; }; }; - -&pci0 { - pcie@0 { - interrupt-map = < - /* IDSEL 0x0 */ - /* - *irq[4:5] are active-high - 
*irq[6:7] are active-low - */ - 0000 0x0 0x0 0x1 &mpic 0x4 0x2 0x0 0x0 - 0000 0x0 0x0 0x2 &mpic 0x5 0x2 0x0 0x0 - 0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0 - 0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0 - >; - }; -}; From 0bf51cc9e9e57a751b4c5dacbfa499ba5cd8bd72 Mon Sep 17 00:00:00 2001 From: Li Yang Date: Fri, 19 Jan 2024 15:38:56 -0500 Subject: [PATCH 60/85] powerpc: dts: mpc85xx: remove "simple-bus" compatible from ifc node Update dts to match dts binding document. Signed-off-by: Li Yang Signed-off-by: Frank Li Signed-off-by: Michael Ellerman Link: https://msgid.link/20240119203911.3143928-3-Frank.Li@nxp.com --- arch/powerpc/boot/dts/fsl/b4si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/bsc9131si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/c293si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/p1010si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t1023si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t1040si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t2081si-post.dtsi | 2 +- arch/powerpc/boot/dts/fsl/t4240si-post.dtsi | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi index 4f044b41a776..fb3200b006ad 100644 --- a/arch/powerpc/boot/dts/fsl/b4si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/b4si-post.dtsi @@ -50,7 +50,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <25 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/bsc9131si-post.dtsi b/arch/powerpc/boot/dts/fsl/bsc9131si-post.dtsi index 2a677fd323eb..5c53cee8755f 100644 --- a/arch/powerpc/boot/dts/fsl/bsc9131si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/bsc9131si-post.dtsi @@ -35,7 +35,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <16 2 0 0 20 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi 
b/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi index b8e0edd1ac69..4da451e000d9 100644 --- a/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/bsc9132si-post.dtsi @@ -35,7 +35,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; /* FIXME: Test whether interrupts are split */ interrupts = <16 2 0 0 20 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/c293si-post.dtsi b/arch/powerpc/boot/dts/fsl/c293si-post.dtsi index f208fb8f64b3..2d443d519274 100644 --- a/arch/powerpc/boot/dts/fsl/c293si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/c293si-post.dtsi @@ -35,7 +35,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <19 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi index b540e58ff79e..2d2550729dcc 100644 --- a/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010si-post.dtsi @@ -35,7 +35,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <16 2 0 0 19 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi index aa5152ca8120..8ef0c020206b 100644 --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi @@ -52,7 +52,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <25 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi index 776788623204..c9542b73bd7f 100644 --- a/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t1040si-post.dtsi @@ -52,7 +52,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; 
interrupts = <25 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi index 27714dc2f04a..6bb95878d39d 100644 --- a/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t2081si-post.dtsi @@ -50,7 +50,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <25 2 0 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi index fcac73486d48..65f3e17c0d41 100644 --- a/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi +++ b/arch/powerpc/boot/dts/fsl/t4240si-post.dtsi @@ -50,7 +50,7 @@ &ifc { #address-cells = <2>; #size-cells = <1>; - compatible = "fsl,ifc", "simple-bus"; + compatible = "fsl,ifc"; interrupts = <25 2 0 0>; }; From acb354fe97e5aa6d9534b601ce18ef7866f25c4d Mon Sep 17 00:00:00 2001 From: Li Yang Date: Fri, 19 Jan 2024 15:38:57 -0500 Subject: [PATCH 61/85] powerpc: dts: fsl: rename ifc node name to be memory-controller Update the node name to align with the binding document. 
Signed-off-by: Li Yang Signed-off-by: Frank Li Signed-off-by: Michael Ellerman Link: https://msgid.link/20240119203911.3143928-4-Frank.Li@nxp.com --- arch/powerpc/boot/dts/fsl/bsc9131rdb.dts | 2 +- arch/powerpc/boot/dts/fsl/bsc9132qds.dts | 2 +- arch/powerpc/boot/dts/fsl/c293pcie.dts | 2 +- arch/powerpc/boot/dts/fsl/p1010rdb_32b.dtsi | 2 +- arch/powerpc/boot/dts/fsl/p1010rdb_36b.dtsi | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/bsc9131rdb.dts b/arch/powerpc/boot/dts/fsl/bsc9131rdb.dts index 8da984251abc..0ba86a6dce1b 100644 --- a/arch/powerpc/boot/dts/fsl/bsc9131rdb.dts +++ b/arch/powerpc/boot/dts/fsl/bsc9131rdb.dts @@ -15,7 +15,7 @@ device_type = "memory"; }; - board_ifc: ifc: ifc@ff71e000 { + board_ifc: ifc: memory-controller@ff71e000 { /* NAND Flash on board */ ranges = <0x0 0x0 0x0 0xff800000 0x00004000>; reg = <0x0 0xff71e000 0x0 0x2000>; diff --git a/arch/powerpc/boot/dts/fsl/bsc9132qds.dts b/arch/powerpc/boot/dts/fsl/bsc9132qds.dts index 7cb2158dfe58..ce642e879a1b 100644 --- a/arch/powerpc/boot/dts/fsl/bsc9132qds.dts +++ b/arch/powerpc/boot/dts/fsl/bsc9132qds.dts @@ -15,7 +15,7 @@ device_type = "memory"; }; - ifc: ifc@ff71e000 { + ifc: memory-controller@ff71e000 { /* NOR, NAND Flash on board */ ranges = <0x0 0x0 0x0 0x88000000 0x08000000 0x1 0x0 0x0 0xff800000 0x00010000>; diff --git a/arch/powerpc/boot/dts/fsl/c293pcie.dts b/arch/powerpc/boot/dts/fsl/c293pcie.dts index 5e905e0857cf..e2fdac2ed420 100644 --- a/arch/powerpc/boot/dts/fsl/c293pcie.dts +++ b/arch/powerpc/boot/dts/fsl/c293pcie.dts @@ -42,7 +42,7 @@ device_type = "memory"; }; - ifc: ifc@fffe1e000 { + ifc: memory-controller@fffe1e000 { reg = <0xf 0xffe1e000 0 0x2000>; ranges = <0x0 0x0 0xf 0xec000000 0x04000000 0x1 0x0 0xf 0xff800000 0x00010000 diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb_32b.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb_32b.dtsi index fdc19aab2f70..583a6cd05079 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb_32b.dtsi +++ 
b/arch/powerpc/boot/dts/fsl/p1010rdb_32b.dtsi @@ -36,7 +36,7 @@ memory { device_type = "memory"; }; -board_ifc: ifc: ifc@ffe1e000 { +board_ifc: ifc: memory-controller@ffe1e000 { /* NOR, NAND Flashes and CPLD on board */ ranges = <0x0 0x0 0x0 0xee000000 0x02000000 0x1 0x0 0x0 0xff800000 0x00010000 diff --git a/arch/powerpc/boot/dts/fsl/p1010rdb_36b.dtsi b/arch/powerpc/boot/dts/fsl/p1010rdb_36b.dtsi index de2fceed4f79..4d41efe0038f 100644 --- a/arch/powerpc/boot/dts/fsl/p1010rdb_36b.dtsi +++ b/arch/powerpc/boot/dts/fsl/p1010rdb_36b.dtsi @@ -36,7 +36,7 @@ memory { device_type = "memory"; }; -board_ifc: ifc: ifc@fffe1e000 { +board_ifc: ifc: memory-controller@fffe1e000 { /* NOR, NAND Flashes and CPLD on board */ ranges = <0x0 0x0 0xf 0xee000000 0x02000000 0x1 0x0 0xf 0xff800000 0x00010000 From 473e2311f31fdcae8e3f4410d119dbfece656edc Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Sat, 13 Jan 2024 08:05:09 +0000 Subject: [PATCH 62/85] powerpc: Fix preserved memory size for int-vectors The first 32k of memory is reserved for interrupt vectors, however for powerpc64 this might not be enough. Fix this by reserving the maximum size between 32k and the real size of interrupt vectors. Signed-off-by: GUO Zihua Signed-off-by: Michael Ellerman Link: https://msgid.link/20240113080509.1598290-1-guozihua@huawei.com --- arch/powerpc/kernel/prom.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index b8f764453eaa..eb140ea6b6ff 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -779,6 +779,7 @@ static inline void save_fscr_to_task(void) {} void __init early_init_devtree(void *params) { + phys_addr_t int_vector_size; DBG(" -> early_init_devtree(%px)\n", params); @@ -831,9 +832,16 @@ void __init early_init_devtree(void *params) setup_initial_memory_limit(memstart_addr, first_memblock_size); /* Reserve MEMBLOCK regions used by kernel, initrd, dt, etc... 
*/ memblock_reserve(PHYSICAL_START, __pa(_end) - PHYSICAL_START); +#ifdef CONFIG_PPC64 + /* If relocatable, reserve at least 32k for interrupt vectors etc. */ + int_vector_size = __end_interrupts - _stext; + int_vector_size = max_t(phys_addr_t, SZ_32K, int_vector_size); +#else /* If relocatable, reserve first 32k for interrupt vectors etc. */ + int_vector_size = SZ_32K; +#endif if (PHYSICAL_START > MEMORY_START) - memblock_reserve(MEMORY_START, 0x8000); + memblock_reserve(MEMORY_START, int_vector_size); reserve_kdump_trampoline(); #if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP) /* From f3560a2ba5cbbb6c62c14dbdc1e33cb3565199d0 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 25 Jan 2024 16:26:37 +0800 Subject: [PATCH 63/85] powerpc/iommu: Code cleanup for cell/iommu.c This part was commented from commit 165785e5c0be ("[POWERPC] Cell iommu support") in about 17 years before. If there are no plans to enable this part code in the future, we can remove this dead code. Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240125082637.532826-1-chentao@kylinos.cn --- arch/powerpc/platforms/cell/iommu.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 1202a69b0a20..4cd9c0de22c2 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -424,23 +424,6 @@ static void __init cell_iommu_setup_hardware(struct cbe_iommu *iommu, cell_iommu_enable_hardware(iommu); } -#if 0/* Unused for now */ -static struct iommu_window *find_window(struct cbe_iommu *iommu, - unsigned long offset, unsigned long size) -{ - struct iommu_window *window; - - /* todo: check for overlapping (but not equal) windows) */ - - list_for_each_entry(window, &(iommu->windows), list) { - if (window->offset == offset && window->size == size) - return window; - } - - return NULL; -} -#endif - static inline u32 
cell_iommu_get_ioid(struct device_node *np) { const u32 *ioid; From 66d8e646e8e78ea6088d9f6b9465e211566b5133 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 26 Jan 2024 10:12:58 +0800 Subject: [PATCH 64/85] powerpc/cell: Code cleanup for spufs_mfc_flush This part was commented from commit a33a7d7309d7 ("[PATCH] spufs: implement mfc access for PPE-side DMA") in about 18 years before. If there are no plans to enable this part code in the future, we can remove this dead code. Signed-off-by: Kunwu Chan Suggested-by: Christophe Leroy Acked-by: Arnd Bergmann Signed-off-by: Michael Ellerman Link: https://msgid.link/20240126021258.574916-1-chentao@kylinos.cn --- arch/powerpc/platforms/cell/spufs/file.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 02a8158c469d..7f4e0db8eb08 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1704,23 +1704,11 @@ static int spufs_mfc_flush(struct file *file, fl_owner_t id) ret = spu_acquire(ctx); if (ret) - goto out; -#if 0 -/* this currently hangs */ - ret = spufs_wait(ctx->mfc_wq, - ctx->ops->set_mfc_query(ctx, ctx->tagwait, 2)); - if (ret) - goto out; - ret = spufs_wait(ctx->mfc_wq, - ctx->ops->read_mfc_tagstatus(ctx) == ctx->tagwait); - if (ret) - goto out; -#else - ret = 0; -#endif + return ret; + spu_release(ctx); -out: - return ret; + + return 0; } static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int datasync) From 2d8ebee0aac3a45d81de4f44255c8021d5a3401e Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Fri, 26 Jan 2024 10:50:30 +0800 Subject: [PATCH 65/85] powerpc/pseries/pci: Code cleanup This part was commented in about 19 years before. If there are no plans to enable this part code in the future, we can remove this dead code. 
Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240126025030.577795-1-chentao@kylinos.cn --- arch/powerpc/platforms/pseries/pci.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 1772ae3d193d..6dbc73eb2ca2 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -18,33 +18,6 @@ #include #include "pseries.h" -#if 0 -void pcibios_name_device(struct pci_dev *dev) -{ - struct device_node *dn; - - /* - * Add IBM loc code (slot) as a prefix to the device names for service - */ - dn = pci_device_to_OF_node(dev); - if (dn) { - const char *loc_code = of_get_property(dn, "ibm,loc-code", - NULL); - if (loc_code) { - int loc_len = strlen(loc_code); - if (loc_len < sizeof(dev->dev.name)) { - memmove(dev->dev.name+loc_len+1, dev->dev.name, - sizeof(dev->dev.name)-loc_len-1); - memcpy(dev->dev.name, loc_code, loc_len); - dev->dev.name[loc_len] = ' '; - dev->dev.name[sizeof(dev->dev.name)-1] = '\0'; - } - } - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device); -#endif - #ifdef CONFIG_PCI_IOV #define MAX_VFS_FOR_MAP_PE 256 struct pe_map_bar_entry { From c330b50d8cae1a7b1fed7622eedacaf652396bb7 Mon Sep 17 00:00:00 2001 From: Naveen N Rao Date: Wed, 10 Jan 2024 19:42:37 +0530 Subject: [PATCH 66/85] powerpc/Makefile: Remove bits related to the previous use of -mcmodel=large All supported compilers today (gcc v5.1+ and clang v11+) have support for -mcmodel=medium. As such, NO_MINIMAL_TOC is no longer being set. Remove NO_MINIMAL_TOC as well as the fallback to -mminimal-toc. 
Reviewed-by: Christophe Leroy Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/20240110141237.3179199-1-naveen@kernel.org --- arch/powerpc/Makefile | 6 +----- arch/powerpc/kernel/Makefile | 3 --- arch/powerpc/lib/Makefile | 2 -- arch/powerpc/mm/Makefile | 2 -- arch/powerpc/mm/book3s64/Makefile | 2 -- arch/powerpc/mm/nohash/Makefile | 2 -- arch/powerpc/platforms/pseries/Makefile | 1 - arch/powerpc/sysdev/Makefile | 2 -- arch/powerpc/xmon/Makefile | 2 -- 9 files changed, 1 insertion(+), 21 deletions(-) diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 65261cbe5bfd..0a0c57aee1ae 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -114,7 +114,6 @@ LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) ifdef CONFIG_PPC64 ifndef CONFIG_PPC_KERNEL_PCREL -ifeq ($(call cc-option-yn,-mcmodel=medium),y) # -mcmodel=medium breaks modules because it uses 32bit offsets from # the TOC pointer to create pointers where possible. Pointers into the # percpu data area are created by this method. @@ -124,9 +123,6 @@ ifeq ($(call cc-option-yn,-mcmodel=medium),y) # kernel percpu data space (starting with 0xc...). We need a full # 64bit relocation for this to work, hence -mcmodel=large. KBUILD_CFLAGS_MODULE += -mcmodel=large -else - export NO_MINIMAL_TOC := -mno-minimal-toc -endif endif endif @@ -139,7 +135,7 @@ CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mabi=elfv1) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcall-aixdesc) endif endif -CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mcmodel=medium,$(call cc-option,-mminimal-toc)) +CFLAGS-$(CONFIG_PPC64) += -mcmodel=medium CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mno-pointers-to-nested-functions) CFLAGS-$(CONFIG_PPC64) += $(call cc-option,-mlong-double-128) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f5dd2d65cdbe..8585d03c02d3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -3,9 +3,6 @@ # Makefile for the linux kernel. 
# -ifdef CONFIG_PPC64 -CFLAGS_prom_init.o += $(NO_MINIMAL_TOC) -endif ifdef CONFIG_PPC32 CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 0ab65eeb93ee..f14ecab674a3 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -3,8 +3,6 @@ # Makefile for ppc-specific library files.. # -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - CFLAGS_code-patching.o += -fno-stack-protector CFLAGS_feature-fixups.o += -fno-stack-protector diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 503a6e249940..0fe2f085c05a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -3,8 +3,6 @@ # Makefile for the linux ppc-specific parts of the memory manager. # -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - obj-y := fault.o mem.o pgtable.o maccess.o pageattr.o \ init_$(BITS).o pgtable_$(BITS).o \ pgtable-frag.o ioremap.o ioremap_$(BITS).o \ diff --git a/arch/powerpc/mm/book3s64/Makefile b/arch/powerpc/mm/book3s64/Makefile index cad2abc1730f..33af5795856a 100644 --- a/arch/powerpc/mm/book3s64/Makefile +++ b/arch/powerpc/mm/book3s64/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y := $(NO_MINIMAL_TOC) - obj-y += mmu_context.o pgtable.o trace.o ifdef CONFIG_PPC_64S_HASH_MMU CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE) diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile index f3894e79d5f7..b3f0498dd42f 100644 --- a/arch/powerpc/mm/nohash/Makefile +++ b/arch/powerpc/mm/nohash/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - obj-y += mmu_context.o tlb.o tlb_low.o kup.o obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o book3e_pgtable.o obj-$(CONFIG_40x) += 40x.o diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index f936962a2946..7bf506f6b8c8 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ 
b/arch/powerpc/platforms/pseries/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG obj-y := lpar.o hvCall.o nvram.o reconfig.o \ diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile index 9cb1d029511a..24a177d164f1 100644 --- a/arch/powerpc/sysdev/Makefile +++ b/arch/powerpc/sysdev/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - mpic-msi-obj-$(CONFIG_PCI_MSI) += mpic_msi.o mpic_u3msi.o obj-$(CONFIG_MPIC) += mpic.o $(mpic-msi-obj-y) obj-$(CONFIG_MPIC_TIMER) += mpic_timer.o diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile index 682c7c0a6f77..d778011060a8 100644 --- a/arch/powerpc/xmon/Makefile +++ b/arch/powerpc/xmon/Makefile @@ -10,8 +10,6 @@ KCSAN_SANITIZE := n # Disable ftrace for the entire directory ccflags-remove-$(CONFIG_FUNCTION_TRACER) += $(CC_FLAGS_FTRACE) -ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) - # Clang stores addresses on the stack causing the frame size to blow # out. See https://github.com/ClangBuiltLinux/linux/issues/252 ccflags-$(CONFIG_CC_IS_CLANG) += -Wframe-larger-than=4096 From bc8744c6bf0d487dcb7911d093fce60a62cc2654 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 8 May 2024 00:01:50 +1000 Subject: [PATCH 67/85] macintosh/ams: Fix unused variable warning If both CONFIG_SENSORS_AMS_PMU and CONFIG_SENSORS_AMS_I2C are unset, there is an unused variable warning in the ams driver: drivers/macintosh/ams/ams-core.c: In function 'ams_init': drivers/macintosh/ams/ams-core.c:181:29: warning: unused variable 'np' 181 | struct device_node *np; The driver needs at least one of the configs enabled in order to actually function. So fix the compiler warning by ensuring at least one of the configs is enabled. 
Suggested-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/20240507140150.54630-1-mpe@ellerman.id.au --- drivers/macintosh/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index a0e717a986dc..fb38f684444f 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -262,7 +262,7 @@ config SENSORS_AMS will be called ams. config SENSORS_AMS_PMU - bool "PMU variant" + bool "PMU variant" if SENSORS_AMS_I2C depends on SENSORS_AMS && ADB_PMU default y help From 39434af10f1045b50826b8b506415f36681d4b40 Mon Sep 17 00:00:00 2001 From: Ghanshyam Agrawal Date: Thu, 28 Dec 2023 15:45:18 +0530 Subject: [PATCH 68/85] powerpc/eeh: Fix spelling of the word "auxillary" and update comment Fix spelling of the word "auxillary" in arch/powerpc/kernel/eeh_pe.c and arch/powerpc/include/asm/eeh.h. Also update the eeh_set_pe_aux_size() comment to include the units. Signed-off-by: Ghanshyam Agrawal [mpe: Squash into one commit] Signed-off-by: Michael Ellerman Link: https://msgid.link/2ab034609285b21c309cd8ab26c937c846d37ee7.1703756365.git.ghanshyam1898@gmail.com --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh_pe.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 514dd056c2c8..91a9fd53254f 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -82,7 +82,7 @@ struct eeh_pe { int false_positives; /* Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ - void *data; /* PE auxillary data */ + void *data; /* PE auxiliary data */ struct list_head child_list; /* List of PEs below this PE */ struct list_head child; /* Memb. 
child_list/eeh_phb_pe */ struct list_head edevs; /* List of eeh_dev in this PE */ diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index e0ce81279624..2038454ce864 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -24,10 +24,10 @@ static int eeh_pe_aux_size = 0; static LIST_HEAD(eeh_phb_pe); /** - * eeh_set_pe_aux_size - Set PE auxillary data size - * @size: PE auxillary data size + * eeh_set_pe_aux_size - Set PE auxiliary data size + * @size: PE auxiliary data size in bytes * - * Set PE auxillary data size + * Set PE auxiliary data size. */ void eeh_set_pe_aux_size(int size) { From 0ddbbb8960eaf91c7b432ec80566dfa60a8d79e4 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 3 Jan 2024 17:16:04 -0600 Subject: [PATCH 69/85] powerpc: Fix typos Fix typos, most reported by "codespell arch/powerpc". Only touches comments, no code changes. Signed-off-by: Bjorn Helgaas Signed-off-by: Michael Ellerman Link: https://msgid.link/20240103231605.1801364-8-helgaas@kernel.org --- arch/powerpc/boot/Makefile | 4 ++-- arch/powerpc/boot/dts/acadia.dts | 2 +- arch/powerpc/boot/main.c | 2 +- arch/powerpc/boot/ps3.c | 2 +- arch/powerpc/include/asm/io.h | 2 +- arch/powerpc/include/asm/opal-api.h | 4 ++-- arch/powerpc/include/asm/pmac_feature.h | 2 +- arch/powerpc/include/asm/uninorth.h | 2 +- arch/powerpc/include/uapi/asm/bootx.h | 2 +- arch/powerpc/kernel/eeh_pe.c | 2 +- arch/powerpc/kernel/fadump.c | 2 +- arch/powerpc/kernel/misc_64.S | 4 ++-- arch/powerpc/kernel/process.c | 12 ++++++------ arch/powerpc/kernel/ptrace/ptrace-tm.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/kernel/sysfs.c | 4 ++-- arch/powerpc/kvm/book3s_xive.c | 2 +- arch/powerpc/mm/cacheflush.c | 2 +- arch/powerpc/mm/nohash/kaslr_booke.c | 2 +- arch/powerpc/platforms/512x/mpc512x_shared.c | 2 +- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- arch/powerpc/platforms/maple/pci.c | 2 +- arch/powerpc/platforms/powermac/pic.c | 2 +- 
arch/powerpc/platforms/powermac/sleep.S | 2 +- arch/powerpc/platforms/powernv/pci-sriov.c | 4 ++-- arch/powerpc/platforms/powernv/vas-window.c | 2 +- arch/powerpc/platforms/pseries/vas.c | 2 +- arch/powerpc/sysdev/xive/common.c | 4 ++-- arch/powerpc/sysdev/xive/native.c | 2 +- 29 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 968aee2025b8..9c2b6e527ed1 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -108,8 +108,8 @@ DTC_FLAGS ?= -p 1024 # these files into the build dir, fix up any includes and ensure that dependent # files are copied in the right order. -# these need to be seperate variables because they are copied out of different -# directories in the kernel tree. Sure you COULd merge them, but it's a +# these need to be separate variables because they are copied out of different +# directories in the kernel tree. Sure you COULD merge them, but it's a # cure-is-worse-than-disease situation. zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c diff --git a/arch/powerpc/boot/dts/acadia.dts b/arch/powerpc/boot/dts/acadia.dts index deb52e41ab84..5fedda811378 100644 --- a/arch/powerpc/boot/dts/acadia.dts +++ b/arch/powerpc/boot/dts/acadia.dts @@ -172,7 +172,7 @@ reg = <0xef602800 0x60>; interrupt-parent = <&UIC0>; interrupts = <0x4 0x4>; - /* This thing is a bit weird. It has it's own UIC + /* This thing is a bit weird. It has its own UIC * that it uses to generate snapshot triggers. We * don't really support this device yet, and it needs * work to figure this out. 
diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index cae31a6e8f02..2c0e2a1cab01 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -188,7 +188,7 @@ static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { } /* A buffer that may be edited by tools operating on a zImage binary so as to * edit the command line passed to vmlinux (by setting /chosen/bootargs). - * The buffer is put in it's own section so that tools may locate it easier. + * The buffer is put in its own section so that tools may locate it easier. */ static char cmdline[BOOT_COMMAND_LINE_SIZE] __attribute__((__section__("__builtin_cmdline"))); diff --git a/arch/powerpc/boot/ps3.c b/arch/powerpc/boot/ps3.c index f157717ae814..89ff46b8b225 100644 --- a/arch/powerpc/boot/ps3.c +++ b/arch/powerpc/boot/ps3.c @@ -25,7 +25,7 @@ BSS_STACK(4096); /* A buffer that may be edited by tools operating on a zImage binary so as to * edit the command line passed to vmlinux (by setting /chosen/bootargs). - * The buffer is put in it's own section so that tools may locate it easier. + * The buffer is put in its own section so that tools may locate it easier. */ static char cmdline[BOOT_COMMAND_LINE_SIZE] diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 048e3705af20..52e1b1d15ff6 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -982,7 +982,7 @@ static inline phys_addr_t page_to_phys(struct page *page) } /* - * 32 bits still uses virt_to_bus() for it's implementation of DMA + * 32 bits still uses virt_to_bus() for its implementation of DMA * mappings se we have to keep it defined here. We also have some old * drivers (shame shame shame) that use bus_to_virt() and haven't been * fixed yet so I need to define it here. 
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index a2bc4b95e703..8c9d4b26bf57 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1027,10 +1027,10 @@ struct opal_i2c_request { * The host will pass on OPAL, a buffer of length OPAL_SYSEPOW_MAX * with individual elements being 16 bits wide to fetch the system * wide EPOW status. Each element in the buffer will contain the - * EPOW status in it's bit representation for a particular EPOW sub + * EPOW status in its bit representation for a particular EPOW sub * class as defined here. So multiple detailed EPOW status bits * specific for any sub class can be represented in a single buffer - * element as it's bit representation. + * element as its bit representation. */ /* System EPOW type */ diff --git a/arch/powerpc/include/asm/pmac_feature.h b/arch/powerpc/include/asm/pmac_feature.h index 2495866f2e97..420e2878ae67 100644 --- a/arch/powerpc/include/asm/pmac_feature.h +++ b/arch/powerpc/include/asm/pmac_feature.h @@ -192,7 +192,7 @@ static inline long pmac_call_feature(int selector, struct device_node* node, /* PMAC_FTR_BMAC_ENABLE (struct device_node* node, 0, int value) * enable/disable the bmac (ethernet) cell of a mac-io ASIC, also drive - * it's reset line + * its reset line */ #define PMAC_FTR_BMAC_ENABLE PMAC_FTR_DEF(6) diff --git a/arch/powerpc/include/asm/uninorth.h b/arch/powerpc/include/asm/uninorth.h index e278299b9b37..6949b5daa37d 100644 --- a/arch/powerpc/include/asm/uninorth.h +++ b/arch/powerpc/include/asm/uninorth.h @@ -144,7 +144,7 @@ #define UNI_N_HWINIT_STATE_SLEEPING 0x01 #define UNI_N_HWINIT_STATE_RUNNING 0x02 /* This last bit appear to be used by the bootROM to know the second - * CPU has started and will enter it's sleep loop with IP=0 + * CPU has started and will enter its sleep loop with IP=0 */ #define UNI_N_HWINIT_STATE_CPU1_FLAG 0x10000000 diff --git a/arch/powerpc/include/uapi/asm/bootx.h 
b/arch/powerpc/include/uapi/asm/bootx.h index 6728c7e24e58..1b8c121071d9 100644 --- a/arch/powerpc/include/uapi/asm/bootx.h +++ b/arch/powerpc/include/uapi/asm/bootx.h @@ -108,7 +108,7 @@ typedef struct boot_infos /* ALL BELOW NEW (vers. 4) */ /* This defines the physical memory. Valid with BOOT_ARCH_NUBUS flag - (non-PCI) only. On PCI, memory is contiguous and it's size is in the + (non-PCI) only. On PCI, memory is contiguous and its size is in the device-tree. */ boot_info_map_entry_t physMemoryMap[MAX_MEM_MAP_SIZE]; /* Where the phys memory is */ diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2038454ce864..d1030bc52564 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -527,7 +527,7 @@ EXPORT_SYMBOL_GPL(eeh_pe_state_mark); * eeh_pe_mark_isolated * @pe: EEH PE * - * Record that a PE has been isolated by marking the PE and it's children as + * Record that a PE has been isolated by marking the PE and its children as * EEH_PE_ISOLATED (and EEH_PE_CFG_BLOCKED, if required) and their PCI devices * as pci_channel_io_frozen. */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 2de7379d0f30..0b849563393e 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -681,7 +681,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) * old_cpu == -1 means this is the first CPU which has come here, * go ahead and trigger fadump. * - * old_cpu != -1 means some other CPU has already on it's way + * old_cpu != -1 means some other CPU has already on its way * to trigger fadump, just keep looping here. 
*/ this_cpu = smp_processor_id(); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 1a8cdafd68e8..91123e102db4 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -192,7 +192,7 @@ _GLOBAL(scom970_read) xori r0,r0,MSR_EE mtmsrd r0,1 - /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + /* rotate 24 bits SCOM address 8 bits left and mask out its low 8 bits * (including parity). On current CPUs they must be 0'd, * and finally or in RW bit */ @@ -226,7 +226,7 @@ _GLOBAL(scom970_write) xori r0,r0,MSR_EE mtmsrd r0,1 - /* rotate 24 bits SCOM address 8 bits left and mask out it's low 8 bits + /* rotate 24 bits SCOM address 8 bits left and mask out its low 8 bits * (including parity). On current CPUs they must be 0'd. */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 8ab779a3bdde..a7671786764b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1661,7 +1661,7 @@ void arch_setup_new_exec(void) * cases will happen: * * 1. The correct thread is running, the wrong thread is not - * In this situation, the correct thread is woken and proceeds to pass it's + * In this situation, the correct thread is woken and proceeds to pass its * condition check. * * 2. Neither threads are running @@ -1671,15 +1671,15 @@ void arch_setup_new_exec(void) * for the wrong thread, or they will execute the condition check immediately. * * 3. The wrong thread is running, the correct thread is not - * The wrong thread will be woken, but will fail it's condition check and + * The wrong thread will be woken, but will fail its condition check and * re-execute wait. 
The correct thread, when scheduled, will execute either - * it's condition check (which will pass), or wait, which returns immediately - * when called the first time after the thread is scheduled, followed by it's + * its condition check (which will pass), or wait, which returns immediately + * when called the first time after the thread is scheduled, followed by its * condition check (which will pass). * * 4. Both threads are running - * Both threads will be woken. The wrong thread will fail it's condition check - * and execute another wait, while the correct thread will pass it's condition + * Both threads will be woken. The wrong thread will fail its condition check + * and execute another wait, while the correct thread will pass its condition * check. * * @t: the task to set the thread ID for diff --git a/arch/powerpc/kernel/ptrace/ptrace-tm.c b/arch/powerpc/kernel/ptrace/ptrace-tm.c index 210ea834e603..447bff87fd21 100644 --- a/arch/powerpc/kernel/ptrace/ptrace-tm.c +++ b/arch/powerpc/kernel/ptrace/ptrace-tm.c @@ -12,7 +12,7 @@ void flush_tmregs_to_thread(struct task_struct *tsk) { /* * If task is not current, it will have been flushed already to - * it's thread_struct during __switch_to(). + * its thread_struct during __switch_to(). * * A reclaim flushes ALL the state or if not in TM save TM SPRs * in the appropriate thread structures from live. diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 12e53b3d7923..46e6d2cd7a2d 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1567,7 +1567,7 @@ static void add_cpu_to_masks(int cpu) /* * This CPU will not be in the online mask yet so we need to manually - * add it to it's own thread sibling mask. + * add it to its own thread sibling mask. 
*/ map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 0f39a6b84132..b842c83ab497 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -139,7 +139,7 @@ static unsigned long dscr_default; * @val: Returned cpu specific DSCR default value * * This function returns the per cpu DSCR default value - * for any cpu which is contained in it's PACA structure. + * for any cpu which is contained in its PACA structure. */ static void read_dscr(void *val) { @@ -152,7 +152,7 @@ static void read_dscr(void *val) * @val: New cpu specific DSCR default value to update * * This function updates the per cpu DSCR default value - * for any cpu which is contained in it's PACA structure. + * for any cpu which is contained in its PACA structure. */ static void write_dscr(void *val) { diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 29a382249770..1362c672387e 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -531,7 +531,7 @@ static int xive_vm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) xc->cppr = xive_prio_from_guest(new_cppr); /* - * IPIs are synthetized from MFRR and thus don't need + * IPIs are synthesized from MFRR and thus don't need * any special EOI handling. The underlying interrupt * used to signal MFRR changes is EOId when fetched from * the queue. 
diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 15189592da09..7186516eca52 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL(flush_icache_range); #ifdef CONFIG_HIGHMEM /** - * flush_dcache_icache_phys() - Flush a page by it's physical address + * flush_dcache_icache_phys() - Flush a page by its physical address * @physaddr: the physical address of the page */ static void flush_dcache_icache_phys(unsigned long physaddr) diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c b/arch/powerpc/mm/nohash/kaslr_booke.c index cdff129abb14..5c8d1bb98b3e 100644 --- a/arch/powerpc/mm/nohash/kaslr_booke.c +++ b/arch/powerpc/mm/nohash/kaslr_booke.c @@ -376,7 +376,7 @@ notrace void __init kaslr_early_init(void *dt_ptr, phys_addr_t size) create_kaslr_tlb_entry(1, tlb_virt, tlb_phys); } - /* Copy the kernel to it's new location and run */ + /* Copy the kernel to its new location and run */ memcpy((void *)kernstart_virt_addr, (void *)_stext, kernel_sz); flush_icache_range(kernstart_virt_addr, kernstart_virt_addr + kernel_sz); diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c index 8f75e9574c27..8c1f3b629fc7 100644 --- a/arch/powerpc/platforms/512x/mpc512x_shared.c +++ b/arch/powerpc/platforms/512x/mpc512x_shared.c @@ -279,7 +279,7 @@ static void __init mpc512x_setup_diu(void) * and so negatively affect boot time. Instead we reserve the * already configured frame buffer area so that it won't be * destroyed. The starting address of the area to reserve and - * also it's length is passed to memblock_reserve(). It will be + * also its length is passed to memblock_reserve(). It will be * freed later on first open of fbdev, when splash image is not * needed any more. 
*/ diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 99bd027a7f7c..610ca8570682 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -868,7 +868,7 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio) } /** - * spu_deactivate - unbind a context from it's physical spu + * spu_deactivate - unbind a context from its physical spu * @ctx: spu context to unbind * * Unbind @ctx from the physical spu it is running on and schedule diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index b911b31717cc..b9ff37c7f6f0 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -595,7 +595,7 @@ void __init maple_pci_init(void) /* Probe root PCI hosts, that is on U3 the AGP host and the * HyperTransport host. That one is actually "kept" around - * and actually added last as it's resource management relies + * and actually added last as its resource management relies * on the AGP resources to have been setup first */ root = of_find_node_by_path("/"); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 7135ea1d7db6..2202bf77c7a3 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -2,7 +2,7 @@ /* * Support for the interrupt controllers found on Power Macintosh, * currently Apple's "Grand Central" interrupt controller in all - * it's incarnations. OpenPIC support used on newer machines is + * its incarnations. 
OpenPIC support used on newer machines is * in a separate file * * Copyright (C) 1997 Paul Mackerras (paulus@samba.org) diff --git a/arch/powerpc/platforms/powermac/sleep.S b/arch/powerpc/platforms/powermac/sleep.S index d497a60003d2..822ed70cdcbf 100644 --- a/arch/powerpc/platforms/powermac/sleep.S +++ b/arch/powerpc/platforms/powermac/sleep.S @@ -176,7 +176,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_USE_HIGH_BATS) * memory location containing the PC to resume from * at address 0. * - On Core99, we must store the wakeup vector at - * address 0x80 and eventually it's parameters + * address 0x80 and eventually its parameters * at address 0x84. I've have some trouble with those * parameters however and I no longer use them. */ diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c index 59882da3e742..cc7b1dd54ac6 100644 --- a/arch/powerpc/platforms/powernv/pci-sriov.c +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -238,7 +238,7 @@ void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) } else if (pdev->is_physfn) { /* * For PFs adjust their allocated IOV resources to match what - * the PHB can support using it's M64 BAR table. + * the PHB can support using its M64 BAR table. 
*/ pnv_pci_ioda_fixup_iov_resources(pdev); } @@ -658,7 +658,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) list_add_tail(&pe->list, &phb->ioda.pe_list); mutex_unlock(&phb->ioda.pe_list_mutex); - /* associate this pe to it's pdn */ + /* associate this pe to its pdn */ list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { if (vf_pdn->busno == vf_bus && vf_pdn->devfn == vf_devfn) { diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index b664838008c1..5147df3a18ac 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1059,7 +1059,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } else { /* - * Interrupt hanlder or fault window setup failed. Means + * Interrupt handler or fault window setup failed. Means * NX can not generate fault for page fault. So not * opening for user space tx window. */ diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 71d52a670d95..ba3fb7a7f2ea 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -228,7 +228,7 @@ static irqreturn_t pseries_vas_irq_handler(int irq, void *data) struct pseries_vas_window *txwin = data; /* - * The thread hanlder will process this interrupt if it is + * The thread handler will process this interrupt if it is * already running. */ atomic_inc(&txwin->pending_faults); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index a289cb97c1d7..fa01818c1972 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -383,7 +383,7 @@ static unsigned int xive_get_irq(void) * CPU. 
* * If we find that there is indeed more in there, we call - * force_external_irq_replay() to make Linux synthetize an + * force_external_irq_replay() to make Linux synthesize an * external interrupt on the next call to local_irq_restore(). */ static void xive_do_queue_eoi(struct xive_cpu *xc) @@ -874,7 +874,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * * This also tells us that it's in flight to a host queue * or has already been fetched but hasn't been EOIed yet - * by the host. This it's potentially using up a host + * by the host. Thus it's potentially using up a host * queue slot. This is important to know because as long * as this is the case, we must not hard-unmask it when * "returning" that interrupt to the host. diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index f1c0fa6ece21..517b963e3e6a 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -415,7 +415,7 @@ static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) return; } - /* Grab it's CAM value */ + /* Grab its CAM value */ rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); if (rc) { pr_err("Failed to get pool VP info CPU %d\n", cpu); From ad679719d7020a200c4a10248ebb3bbb374d423d Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Wed, 24 Jan 2024 11:50:31 +0100 Subject: [PATCH 70/85] powerpc: rename SPRN_HID2 define to SPRN_HID2_750FX This register number is hardware-specific, rename it for clarity. FIXME comments are added in a few places where it seems like the wrong register is used. As I can't test this, only the rename is done with no functional change. 
Signed-off-by: Matthias Schiffer Signed-off-by: Michael Ellerman Link: https://msgid.link/20240124105031.45734-1-matthias.schiffer@ew.tq-group.com --- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/cpu_setup_6xx.S | 4 ++-- arch/powerpc/kvm/book3s_emulate.c | 4 ++-- arch/powerpc/platforms/52xx/lite5200_sleep.S | 6 ++++-- arch/powerpc/platforms/83xx/suspend-asm.S | 6 ++++-- drivers/cpufreq/pmac32-cpufreq.c | 8 ++++---- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index d3d1aea009b4..eed33cb916d0 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -615,7 +615,7 @@ #define HID1_ABE (1<<10) /* 7450 Address Broadcast Enable */ #define HID1_PS (1<<16) /* 750FX PLL selection */ #endif -#define SPRN_HID2 0x3F8 /* Hardware Implementation Register 2 */ +#define SPRN_HID2_750FX 0x3F8 /* IBM 750FX HID2 Register */ #define SPRN_HID2_GEKKO 0x398 /* Gekko HID2 Register */ #define SPRN_HID2_G2_LE 0x3F3 /* G2_LE HID2 Register */ #define HID2_G2_LE_HBE (1<<18) /* High BAT Enable (G2_LE) */ diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index bfd3f442e5eb..ab3ca74e6730 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -401,7 +401,7 @@ _GLOBAL(__save_cpu_setup) andi. 
r3,r3,0xff00 cmpwi cr0,r3,0x0200 bne 1f - mfspr r4,SPRN_HID2 + mfspr r4,SPRN_HID2_750FX stw r4,CS_HID2(r5) 1: mtcr r7 @@ -496,7 +496,7 @@ _GLOBAL(__restore_cpu_setup) bne 4f lwz r4,CS_HID2(r5) rlwinm r4,r4,0,19,17 - mtspr SPRN_HID2,r4 + mtspr SPRN_HID2_750FX,r4 sync 4: lwz r4,CS_HID1(r5) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 5bbfb2eed127..de126d153328 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -714,7 +714,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_HID1: to_book3s(vcpu)->hid[1] = spr_val; break; - case SPRN_HID2: + case SPRN_HID2_750FX: to_book3s(vcpu)->hid[2] = spr_val; break; case SPRN_HID2_GEKKO: @@ -900,7 +900,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_HID1: *spr_val = to_book3s(vcpu)->hid[1]; break; - case SPRN_HID2: + case SPRN_HID2_750FX: case SPRN_HID2_GEKKO: *spr_val = to_book3s(vcpu)->hid[2]; break; diff --git a/arch/powerpc/platforms/52xx/lite5200_sleep.S b/arch/powerpc/platforms/52xx/lite5200_sleep.S index 0b12647e7b42..0ec2522ee4ad 100644 --- a/arch/powerpc/platforms/52xx/lite5200_sleep.S +++ b/arch/powerpc/platforms/52xx/lite5200_sleep.S @@ -203,7 +203,8 @@ lite5200_wakeup: /* HIDs, MSR */ LOAD_SPRN(HID1, 0x19) - LOAD_SPRN(HID2, 0x1a) + /* FIXME: Should this use HID2_G2_LE? */ + LOAD_SPRN(HID2_750FX, 0x1a) /* address translation is tricky (see turn_on_mmu) */ @@ -283,7 +284,8 @@ SYM_FUNC_START_LOCAL(save_regs) SAVE_SPRN(HID0, 0x18) SAVE_SPRN(HID1, 0x19) - SAVE_SPRN(HID2, 0x1a) + /* FIXME: Should this use HID2_G2_LE? 
*/ + SAVE_SPRN(HID2_750FX, 0x1a) mfmsr r10 stw r10, (4*0x1b)(r4) /*SAVE_SPRN(LR, 0x1c) have to save it before the call */ diff --git a/arch/powerpc/platforms/83xx/suspend-asm.S b/arch/powerpc/platforms/83xx/suspend-asm.S index bc6bd4d0ae96..6a62ed6082c9 100644 --- a/arch/powerpc/platforms/83xx/suspend-asm.S +++ b/arch/powerpc/platforms/83xx/suspend-asm.S @@ -68,7 +68,8 @@ _GLOBAL(mpc83xx_enter_deep_sleep) mfspr r5, SPRN_HID0 mfspr r6, SPRN_HID1 - mfspr r7, SPRN_HID2 + /* FIXME: Should this use SPRN_HID2_G2_LE? */ + mfspr r7, SPRN_HID2_750FX stw r5, SS_HID+0(r3) stw r6, SS_HID+4(r3) @@ -396,7 +397,8 @@ mpc83xx_deep_resume: mtspr SPRN_HID0, r5 mtspr SPRN_HID1, r6 - mtspr SPRN_HID2, r7 + /* FIXME: Should this use SPRN_HID2_G2_LE? */ + mtspr SPRN_HID2_750FX, r7 lwz r4, SS_IABR+0(r3) lwz r5, SS_IABR+4(r3) diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index df3567c1e93b..6c9f0888a2a7 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -120,9 +120,9 @@ static int cpu_750fx_cpu_speed(int low_speed) /* tweak L2 for high voltage */ if (has_cpu_l2lve) { - hid2 = mfspr(SPRN_HID2); + hid2 = mfspr(SPRN_HID2_750FX); hid2 &= ~0x2000; - mtspr(SPRN_HID2, hid2); + mtspr(SPRN_HID2_750FX, hid2); } } #ifdef CONFIG_PPC_BOOK3S_32 @@ -131,9 +131,9 @@ static int cpu_750fx_cpu_speed(int low_speed) if (low_speed == 1) { /* tweak L2 for low voltage */ if (has_cpu_l2lve) { - hid2 = mfspr(SPRN_HID2); + hid2 = mfspr(SPRN_HID2_750FX); hid2 |= 0x2000; - mtspr(SPRN_HID2, hid2); + mtspr(SPRN_HID2_750FX, hid2); } /* ramping down, set voltage last */ From 295454eda97b9c5f7a64ac5c2bb827fd15efb623 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 12 Feb 2024 15:50:20 +1100 Subject: [PATCH 71/85] powerpc64/kasan: Pass virtual addresses to kasan_init_phys_region() The kasan_init_phys_region() function maps shadow pages necessary for the ranges of the linear map backed by physical pages. 
Currently kasan_init_phys_region() is being passed physical addresses, but kasan_mem_to_shadow() expects virtual addresses. It works right now because the lower bits (12:64) of the kasan_mem_to_shadow() calculation are the same for the real and virtual addresses, so the actual PTE value is the same in the end. But virtual addresses are the intended input, so fix it. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240212045020.70364-1-bgray@linux.ibm.com --- arch/powerpc/mm/kasan/init_book3e_64.c | 2 +- arch/powerpc/mm/kasan/init_book3s_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 11519e88dc6b..43c03b84ff32 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -112,7 +112,7 @@ void __init kasan_init(void) pte_t zero_pte = pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO); for_each_mem_range(i, &start, &end) - kasan_init_phys_region((void *)start, (void *)end); + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) kasan_remove_zero_shadow((void *)VMALLOC_START, VMALLOC_SIZE); diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 9300d641cf9a..3fb5ce4f48f4 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -62,7 +62,7 @@ void __init kasan_init(void) } for_each_mem_range(i, &start, &end) - kasan_init_phys_region((void *)start, (void *)end); + kasan_init_phys_region(phys_to_virt(start), phys_to_virt(end)); for (i = 0; i < PTRS_PER_PTE; i++) __set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page, From c5ef5e35844ad30503c49802b9d6a6c818fca886 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 25 Mar 2024 16:28:14 +1100 Subject: [PATCH 72/85] powerpc/code-patching: Test patch_instructions() during boot 
patch_instructions() introduces new behaviour with a couple of variations. Test each case of * a repeated 32-bit instruction, * a repeated 64-bit instruction (ppc64), and * a copied sequence of instructions for both on a single page and when it crosses a page boundary. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240325052815.854044-1-bgray@linux.ibm.com --- arch/powerpc/lib/test-code-patching.c | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c index c44823292f73..f76030087f98 100644 --- a/arch/powerpc/lib/test-code-patching.c +++ b/arch/powerpc/lib/test-code-patching.c @@ -347,6 +347,97 @@ static void __init test_prefixed_patching(void) check(!memcmp(iptr, expected, sizeof(expected))); } +static void __init test_multi_instruction_patching(void) +{ + u32 code[32]; + void *buf; + u32 *addr32; + u64 *addr64; + ppc_inst_t inst64 = ppc_inst_prefix(OP_PREFIX << 26 | 3UL << 24, PPC_RAW_TRAP()); + u32 inst32 = PPC_RAW_NOP(); + + buf = vzalloc(PAGE_SIZE * 8); + check(buf); + if (!buf) + return; + + /* Test single page 32-bit repeated instruction */ + addr32 = buf + PAGE_SIZE; + check(!patch_instructions(addr32 + 1, &inst32, 12, true)); + + check(addr32[0] == 0); + check(addr32[1] == inst32); + check(addr32[2] == inst32); + check(addr32[3] == inst32); + check(addr32[4] == 0); + + /* Test single page 64-bit repeated instruction */ + if (IS_ENABLED(CONFIG_PPC64)) { + check(ppc_inst_prefixed(inst64)); + + addr64 = buf + PAGE_SIZE * 2; + ppc_inst_write(code, inst64); + check(!patch_instructions((u32 *)(addr64 + 1), code, 24, true)); + + check(addr64[0] == 0); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[1]), inst64)); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[2]), inst64)); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[3]), inst64)); + check(addr64[4] == 0); + } + + /* Test single page memcpy */ + addr32 = 
buf + PAGE_SIZE * 3; + + for (int i = 0; i < ARRAY_SIZE(code); i++) + code[i] = i + 1; + + check(!patch_instructions(addr32 + 1, code, sizeof(code), false)); + + check(addr32[0] == 0); + check(!memcmp(&addr32[1], code, sizeof(code))); + check(addr32[ARRAY_SIZE(code) + 1] == 0); + + /* Test multipage 32-bit repeated instruction */ + addr32 = buf + PAGE_SIZE * 4 - 8; + check(!patch_instructions(addr32 + 1, &inst32, 12, true)); + + check(addr32[0] == 0); + check(addr32[1] == inst32); + check(addr32[2] == inst32); + check(addr32[3] == inst32); + check(addr32[4] == 0); + + /* Test multipage 64-bit repeated instruction */ + if (IS_ENABLED(CONFIG_PPC64)) { + check(ppc_inst_prefixed(inst64)); + + addr64 = buf + PAGE_SIZE * 5 - 8; + ppc_inst_write(code, inst64); + check(!patch_instructions((u32 *)(addr64 + 1), code, 24, true)); + + check(addr64[0] == 0); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[1]), inst64)); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[2]), inst64)); + check(ppc_inst_equal(ppc_inst_read((u32 *)&addr64[3]), inst64)); + check(addr64[4] == 0); + } + + /* Test multipage memcpy */ + addr32 = buf + PAGE_SIZE * 6 - 12; + + for (int i = 0; i < ARRAY_SIZE(code); i++) + code[i] = i + 1; + + check(!patch_instructions(addr32 + 1, code, sizeof(code), false)); + + check(addr32[0] == 0); + check(!memcmp(&addr32[1], code, sizeof(code))); + check(addr32[ARRAY_SIZE(code) + 1] == 0); + + vfree(buf); +} + static int __init test_code_patching(void) { pr_info("Running code patching self-tests ...\n"); @@ -356,6 +447,7 @@ static int __init test_code_patching(void) test_create_function_call(); test_translate_branch(); test_prefixed_patching(); + test_multi_instruction_patching(); return 0; } From c3710ee7cd695dc1b0b4b8cfbf464e313467f970 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Mon, 25 Mar 2024 16:28:15 +1100 Subject: [PATCH 73/85] powerpc/code-patching: Use dedicated memory routines for patching The patching page set up as a writable alias may be in 
quadrant 0 (userspace) if the temporary mm path is used. This causes sanitiser failures if so. Sanitiser failures also occur on the non-mm path because the plain memset family is instrumented, and KASAN treats the patching window as poisoned. Introduce locally defined patch_* variants of memset that perform an uninstrumented lower level set, as well as detecting write errors like the original single patch variant does. copy_to_user() is not correct here, as the PTE makes it a proper kernel page (the EAA is privileged access only, RW). It just happens to be in quadrant 0 because that's the hardware's mechanism for using the current PID vs PID 0 in translations. Importantly, it's incorrect to allow user page accesses. Now that the patching memsets are used, we also propagate a failure up to the caller as the single patch variant does. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20240325052815.854044-2-bgray@linux.ibm.com --- arch/powerpc/lib/code-patching.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c6ab46156cda..df64343b9214 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -372,9 +372,32 @@ int patch_instruction(u32 *addr, ppc_inst_t instr) } NOKPROBE_SYMBOL(patch_instruction); +static int patch_memset64(u64 *addr, u64 val, size_t count) +{ + for (u64 *end = addr + count; addr < end; addr++) + __put_kernel_nofault(addr, &val, u64, failed); + + return 0; + +failed: + return -EPERM; +} + +static int patch_memset32(u32 *addr, u32 val, size_t count) +{ + for (u32 *end = addr + count; addr < end; addr++) + __put_kernel_nofault(addr, &val, u32, failed); + + return 0; + +failed: + return -EPERM; +} + static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool repeat_instr) { unsigned long start = (unsigned long)patch_addr; + int err; /* Repeat 
instruction */ if (repeat_instr) { @@ -383,19 +406,19 @@ static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool rep if (ppc_inst_prefixed(instr)) { u64 val = ppc_inst_as_ulong(instr); - memset64((u64 *)patch_addr, val, len / 8); + err = patch_memset64((u64 *)patch_addr, val, len / 8); } else { u32 val = ppc_inst_val(instr); - memset32(patch_addr, val, len / 4); + err = patch_memset32(patch_addr, val, len / 4); } } else { - memcpy(patch_addr, code, len); + err = copy_to_kernel_nofault(patch_addr, code, len); } smp_wmb(); /* smp write barrier */ flush_icache_range(start, start + len); - return 0; + return err; } /* From 8873aab8646194a4446117bb617cc71bddda2dee Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Tue, 9 Mar 2021 19:11:10 +0100 Subject: [PATCH 74/85] powerpc/xmon: Check cpu id in commands "c#", "dp#" and "dx#" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All these commands end up peeking into the PACA using the user originated cpu id as an index. Check that the cpu id is valid in order to prevent xmon from crashing. Instead of printing an error, this follows the same behavior as the "lp s #" command: ignore the buggy cpu id parameter and fall back to the #-less version of the command.
Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://msgid.link/161531347060.252863.10490063933688958044.stgit@bahia.lan --- arch/powerpc/xmon/xmon.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d79d6633f333..bd4813bad317 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1350,7 +1350,7 @@ static int cpu_cmd(void) } termch = cpu; - if (!scanhex(&cpu)) { + if (!scanhex(&cpu) || cpu >= num_possible_cpus()) { /* print cpus waiting or in xmon */ printf("cpus stopped:"); last_cpu = first_cpu = NR_CPUS; @@ -2772,7 +2772,7 @@ static void dump_pacas(void) termch = c; /* Put c back, it wasn't 'a' */ - if (scanhex(&num)) + if (scanhex(&num) && num < num_possible_cpus()) dump_one_paca(num); else dump_one_paca(xmon_owner); @@ -2845,7 +2845,7 @@ static void dump_xives(void) termch = c; /* Put c back, it wasn't 'a' */ - if (scanhex(&num)) + if (scanhex(&num) && num < num_possible_cpus()) dump_one_xive(num); else dump_one_xive(xmon_owner); From 7be6ce7043b4cf293c8826a48fd9f56931cef2cf Mon Sep 17 00:00:00 2001 From: Vaibhav Jain Date: Mon, 15 Apr 2024 09:27:29 +0530 Subject: [PATCH 75/85] KVM: PPC: Book3S HV nestedv2: Cancel pending DEC exception This reverts commit 180c6b072bf3 ("KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception") [1] which prevented canceling a pending HDEC exception for nestedv2 KVM guests. It was done to avoid overhead of a H_GUEST_GET_STATE hcall to read the 'DEC expiry TB' register which was higher compared to handling extra decrementer exceptions. 
However recent benchmarks indicate that overhead of not handling 'DECR' expiry for Nested KVM Guest(L2) is higher and results in much larger exits to Pseries Host(L1) as indicated by the Unixbench-arithoh bench[2] Metric | Current upstream | Revert [1] | Difference % ======================================================================== arithoh-count (10) | 3244831634 | 3403089673 | +04.88% kvm_hv:kvm_guest_exit | 513558 | 152441 | -70.32% probe:kvmppc_gsb_recv | 28060 | 28110 | +00.18% N=1 As indicated by the data above that reverting [1] results in substantial reduction in number of L2->L1 exits with only slight increase in number of H_GUEST_GET_STATE hcalls to read the value of 'DEC expiry TB'. This results in an overall ~4% improvement of arithoh[2] throughput. [1] commit 180c6b072bf3 ("KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception") [2] https://github.com/kdlucas/byte-unixbench/ Fixes: 180c6b072bf3 ("KVM: PPC: Book3S HV nestedv2: Do not cancel pending decrementer exception") Signed-off-by: Vaibhav Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240415035731.103097-1-vaibhav@linux.ibm.com --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8e86eb577eb8..692a7c6f5fd9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4857,7 +4857,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, * entering a nested guest in which case the decrementer is now owned * by L2 and the L1 decrementer is provided in hdec_expires */ - if (!kvmhv_is_nestedv2() && kvmppc_core_pending_dec(vcpu) && + if (kvmppc_core_pending_dec(vcpu) && ((tb < kvmppc_dec_expires_host_tb(vcpu)) || (trap == BOOK3S_INTERRUPT_SYSCALL && kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED))) From a9c08bcd3179a59998d6339505d0010b82cbcb93 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 25 Jan 2024 16:33:48 +0800 
Subject: [PATCH 76/85] KVM: PPC: code cleanup for kvmppc_book3s_irqprio_deliver This code has been commented out since commit 2f4cf5e42d13 ("Add book3s.c"), about 14 years ago. If there are no plans to enable this code in the future, we can remove it as dead code. Signed-off-by: Kunwu Chan Signed-off-by: Michael Ellerman Link: https://msgid.link/20240125083348.533883-1-chentao@kylinos.cn --- arch/powerpc/kvm/book3s.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 8acec144120e..be9fbfbf62f7 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -360,10 +360,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, break; } -#if 0 - printk(KERN_INFO "Deliver interrupt 0x%x? %x\n", vec, deliver); -#endif - if (deliver) kvmppc_inject_interrupt(vcpu, vec, 0); From 651d61bc8b7d8bb622cfc24be2ee92eebb4ed3cc Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 11 Apr 2023 15:44:46 +0930 Subject: [PATCH 77/85] KVM: PPC: Fix documentation for ppc mmu caps The documentation mentions KVM_CAP_PPC_RADIX_MMU, but the defines in the kvm headers spell it KVM_CAP_PPC_MMU_RADIX. Similarly with KVM_CAP_PPC_MMU_HASH_V3. Fixes: c92701322711 ("KVM: PPC: Book3S HV: Add userspace interfaces for POWER9 MMU") Signed-off-by: Joel Stanley Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://msgid.link/20230411061446.26324-1-joel@jms.id.au --- Documentation/virt/kvm/api.rst | 8 ++++---- include/uapi/linux/kvm.h | 4 ++-- tools/include/uapi/linux/kvm.h | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0b5a33ee71ee..636e6794828f 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4300,7 +4300,7 @@ operating system that uses the PIT for timing (e.g. Linux 2.4.x).
4.100 KVM_PPC_CONFIGURE_V3_MMU ------------------------------ -:Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +:Capability: KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_mmuv3_cfg (in) @@ -4334,7 +4334,7 @@ the Power ISA V3.00, Book III section 5.7.6.1. 4.101 KVM_PPC_GET_RMMU_INFO --------------------------- -:Capability: KVM_CAP_PPC_RADIX_MMU +:Capability: KVM_CAP_PPC_MMU_RADIX :Architectures: ppc :Type: vm ioctl :Parameters: struct kvm_ppc_rmmu_info (out) @@ -8095,7 +8095,7 @@ capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior. -8.3 KVM_CAP_PPC_RADIX_MMU +8.3 KVM_CAP_PPC_MMU_RADIX ------------------------- :Architectures: ppc @@ -8105,7 +8105,7 @@ available, means that the kernel can support guests using the radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor). 
-8.4 KVM_CAP_PPC_HASH_MMU_V3 +8.4 KVM_CAP_PPC_MMU_HASH_V3 --------------------------- :Architectures: ppc diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2190adbe3002..d03842abae57 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1221,9 +1221,9 @@ struct kvm_vfio_spapr_tce { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) -/* Available with KVM_CAP_PPC_RADIX_MMU */ +/* Available with KVM_CAP_PPC_MMU_RADIX */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* Available with KVM_CAP_PPC_GET_CPU_CHAR */ #define KVM_PPC_GET_CPU_CHAR _IOR(KVMIO, 0xb1, struct kvm_ppc_cpu_char) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index c3308536482b..4b6b635b8bd1 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1502,7 +1502,7 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_SPAPR_RESIZE_HPT */ #define KVM_PPC_RESIZE_HPT_PREPARE _IOR(KVMIO, 0xad, struct kvm_ppc_resize_hpt) #define KVM_PPC_RESIZE_HPT_COMMIT _IOR(KVMIO, 0xae, struct kvm_ppc_resize_hpt) -/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 */ +/* Available with KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_MMU_HASH_V3 */ #define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) /* Available with KVM_CAP_PPC_RADIX_MMU */ #define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) From b52e8cd3f835869370f8540f1bc804a47a47f02b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 28 Jan 2024 12:34:25 +0100 Subject: [PATCH 78/85] KVM: PPC: Book3S HV nestedv2: 
Fix an error handling path in gs_msg_ops_kvmhv_nestedv2_config_fill_info() The return value of kvmppc_gse_put_buff_info() is not assigned to 'rc' and 'rc' is uninitialized at this point. So the error handling can not work. Assign the expected value to 'rc' to fix the issue. Fixes: 19d31c5f1157 ("KVM: PPC: Add support for nestedv2 guests") Signed-off-by: Christophe JAILLET Reviewed-by: Vaibhav Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/a7ed4cc12e0a0bbd97fac44fe6c222d1c393ec95.1706441651.git.christophe.jaillet@wanadoo.fr --- arch/powerpc/kvm/book3s_hv_nestedv2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nestedv2.c b/arch/powerpc/kvm/book3s_hv_nestedv2.c index 8e6f5355f08b..1091f7a83b25 100644 --- a/arch/powerpc/kvm/book3s_hv_nestedv2.c +++ b/arch/powerpc/kvm/book3s_hv_nestedv2.c @@ -71,8 +71,8 @@ gs_msg_ops_kvmhv_nestedv2_config_fill_info(struct kvmppc_gs_buff *gsb, } if (kvmppc_gsm_includes(gsm, KVMPPC_GSID_RUN_OUTPUT)) { - kvmppc_gse_put_buff_info(gsb, KVMPPC_GSID_RUN_OUTPUT, - cfg->vcpu_run_output_cfg); + rc = kvmppc_gse_put_buff_info(gsb, KVMPPC_GSID_RUN_OUTPUT, + cfg->vcpu_run_output_cfg); if (rc < 0) return rc; } From 98ec6d38ee57a734123c6f5d42640804034024ef Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 8 May 2024 09:41:17 +0100 Subject: [PATCH 79/85] selftests/powerpc/dexcr: Fix spelling mistake "predicition" -> "prediction" There is a spelling mistake in the help message. Fix it. 
Signed-off-by: Colin Ian King Signed-off-by: Michael Ellerman Link: https://msgid.link/20240508084117.2869261-1-colin.i.king@gmail.com --- tools/testing/selftests/powerpc/dexcr/chdexcr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/powerpc/dexcr/chdexcr.c b/tools/testing/selftests/powerpc/dexcr/chdexcr.c index bda44630cada..c548d7a5bb9b 100644 --- a/tools/testing/selftests/powerpc/dexcr/chdexcr.c +++ b/tools/testing/selftests/powerpc/dexcr/chdexcr.c @@ -26,7 +26,7 @@ static void help(void) "\n" "The normal option sets the aspect in the DEXCR. The --no- variant\n" "clears that aspect. For example, --ibrtpd sets the IBRTPD aspect bit,\n" - "so indirect branch predicition will be disabled in the provided program.\n" + "so indirect branch prediction will be disabled in the provided program.\n" "Conversely, --no-ibrtpd clears the aspect bit, so indirect branch\n" "prediction may occur.\n" "\n" From 78d5cc15fb7d1b2683f0baf418a9a870c02319fb Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 9 May 2024 17:27:53 +0530 Subject: [PATCH 80/85] powerpc/pseries/fadump: add support for multiple boot memory regions Currently, fadump on pseries assumes a single boot memory region even though f/w supports more than one boot memory region. Add support for more boot memory regions to make the implementation flexible for any enhancements that introduce other region types. For this, rtas memory structure for fadump is updated to have multiple boot memory regions instead of just one. Additionally, methods responsible for creating the fadump memory structure during both the first and second kernel boot have been modified to take these multiple boot memory regions into account. Also, a new callback has been added to the fadump_ops structure to get the maximum boot memory regions supported by the platform. 
Signed-off-by: Sourabh Jain Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240509115755.519982-2-hbathini@linux.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 2 +- arch/powerpc/kernel/fadump.c | 27 +- arch/powerpc/platforms/powernv/opal-fadump.c | 7 + arch/powerpc/platforms/pseries/rtas-fadump.c | 251 +++++++++++++------ arch/powerpc/platforms/pseries/rtas-fadump.h | 26 +- 5 files changed, 195 insertions(+), 118 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 5d706a7acc8a..35787fa1ac60 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -156,6 +156,7 @@ struct fadump_ops { struct seq_file *m); void (*fadump_trigger)(struct fadump_crash_info_header *fdh, const char *msg); + int (*fadump_max_boot_mem_rgns)(void); }; /* Helper functions */ @@ -163,7 +164,6 @@ s32 __init fadump_setup_cpu_notes_buf(u32 num_cpus); void fadump_free_cpu_notes_buf(void); u32 *__init fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs); void __init fadump_update_elfcore_header(char *bufp); -bool is_fadump_boot_mem_contiguous(void); bool is_fadump_reserved_mem_contiguous(void); #else /* !CONFIG_PRESERVE_FA_DUMP */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 0b849563393e..fe6be00451b9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -220,28 +220,6 @@ static bool is_fadump_mem_area_contiguous(u64 d_start, u64 d_end) return ret; } -/* - * Returns true, if there are no holes in boot memory area, - * false otherwise. 
- */ -bool is_fadump_boot_mem_contiguous(void) -{ - unsigned long d_start, d_end; - bool ret = false; - int i; - - for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { - d_start = fw_dump.boot_mem_addr[i]; - d_end = d_start + fw_dump.boot_mem_sz[i]; - - ret = is_fadump_mem_area_contiguous(d_start, d_end); - if (!ret) - break; - } - - return ret; -} - /* * Returns true, if there are no holes in reserved memory area, * false otherwise. @@ -381,10 +359,11 @@ static unsigned long __init get_fadump_area_size(void) static int __init add_boot_mem_region(unsigned long rstart, unsigned long rsize) { + int max_boot_mem_rgns = fw_dump.ops->fadump_max_boot_mem_rgns(); int i = fw_dump.boot_mem_regs_cnt++; - if (fw_dump.boot_mem_regs_cnt > FADUMP_MAX_MEM_REGS) { - fw_dump.boot_mem_regs_cnt = FADUMP_MAX_MEM_REGS; + if (fw_dump.boot_mem_regs_cnt > max_boot_mem_rgns) { + fw_dump.boot_mem_regs_cnt = max_boot_mem_rgns; return 0; } diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 767a6b19e42a..5a88d7efb48a 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -599,6 +599,12 @@ static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, pr_emerg("No backend support for MPIPL!\n"); } +/* FADUMP_MAX_MEM_REGS or lower */ +static int opal_fadump_max_boot_mem_rgns(void) +{ + return FADUMP_MAX_MEM_REGS; +} + static struct fadump_ops opal_fadump_ops = { .fadump_init_mem_struct = opal_fadump_init_mem_struct, .fadump_get_metadata_size = opal_fadump_get_metadata_size, @@ -611,6 +617,7 @@ static struct fadump_ops opal_fadump_ops = { .fadump_process = opal_fadump_process, .fadump_region_show = opal_fadump_region_show, .fadump_trigger = opal_fadump_trigger, + .fadump_max_boot_mem_rgns = opal_fadump_max_boot_mem_rgns, }; void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c 
b/arch/powerpc/platforms/pseries/rtas-fadump.c index 214f37788b2d..4db78b2bb2a8 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -29,9 +29,6 @@ static const struct rtas_fadump_mem_struct *fdm_active; static void rtas_fadump_update_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { - fadump_conf->boot_mem_dest_addr = - be64_to_cpu(fdm->rmr_region.destination_address); - fadump_conf->fadumphdr_addr = (fadump_conf->boot_mem_dest_addr + fadump_conf->boot_memory_size); } @@ -43,20 +40,53 @@ static void rtas_fadump_update_config(struct fw_dump *fadump_conf, static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, const struct rtas_fadump_mem_struct *fdm) { - fadump_conf->boot_mem_addr[0] = - be64_to_cpu(fdm->rmr_region.source_address); - fadump_conf->boot_mem_sz[0] = be64_to_cpu(fdm->rmr_region.source_len); - fadump_conf->boot_memory_size = fadump_conf->boot_mem_sz[0]; + unsigned long base, size, last_end, hole_size; - fadump_conf->boot_mem_top = fadump_conf->boot_memory_size; - fadump_conf->boot_mem_regs_cnt = 1; + last_end = 0; + hole_size = 0; + fadump_conf->boot_memory_size = 0; + fadump_conf->boot_mem_regs_cnt = 0; + pr_debug("Boot memory regions:\n"); + for (int i = 0; i < be16_to_cpu(fdm->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm->rgn[i].source_data_type); + u64 addr; - /* - * Start address of reserve dump area (permanent reservation) for - * re-registering FADump after dump capture. - */ - fadump_conf->reserve_dump_area_start = - be64_to_cpu(fdm->cpu_state_data.destination_address); + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + addr = be64_to_cpu(fdm->rgn[i].destination_address); + + fadump_conf->cpu_state_dest_vaddr = (u64)__va(addr); + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. 
+ */ + fadump_conf->reserve_dump_area_start = addr; + break; + case RTAS_FADUMP_HPTE_REGION: + /* Not processed currently. */ + break; + case RTAS_FADUMP_REAL_MODE_REGION: + base = be64_to_cpu(fdm->rgn[i].source_address); + size = be64_to_cpu(fdm->rgn[i].source_len); + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + if (!base) { + fadump_conf->boot_mem_dest_addr = + be64_to_cpu(fdm->rgn[i].destination_address); + } + + fadump_conf->boot_mem_addr[fadump_conf->boot_mem_regs_cnt] = base; + fadump_conf->boot_mem_sz[fadump_conf->boot_mem_regs_cnt] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + last_end = base + size; + fadump_conf->boot_mem_regs_cnt++; + break; + default: + pr_warn("Section type %d unsupported on this kernel. Ignoring!\n", type); + break; + } + } + fadump_conf->boot_mem_top = fadump_conf->boot_memory_size + hole_size; rtas_fadump_update_config(fadump_conf, fdm); } @@ -64,16 +94,15 @@ static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) { u64 addr = fadump_conf->reserve_dump_area_start; + u16 sec_cnt = 0; memset(&fdm, 0, sizeof(struct rtas_fadump_mem_struct)); addr = addr & PAGE_MASK; fdm.header.dump_format_version = cpu_to_be32(0x00000001); - fdm.header.dump_num_sections = cpu_to_be16(3); fdm.header.dump_status_flag = 0; fdm.header.offset_first_dump_section = - cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, - cpu_state_data)); + cpu_to_be32((u32)offsetof(struct rtas_fadump_mem_struct, rgn)); /* * Fields for disk dump option. @@ -89,25 +118,22 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) /* Kernel dump sections */ /* cpu state data section. 
*/ - fdm.cpu_state_data.request_flag = - cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.cpu_state_data.source_data_type = - cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); - fdm.cpu_state_data.source_address = 0; - fdm.cpu_state_data.source_len = - cpu_to_be64(fadump_conf->cpu_state_data_size); - fdm.cpu_state_data.destination_address = cpu_to_be64(addr); + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_CPU_STATE_DATA); + fdm.rgn[sec_cnt].source_address = 0; + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->cpu_state_data_size); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); addr += fadump_conf->cpu_state_data_size; + sec_cnt++; /* hpte region section */ - fdm.hpte_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.hpte_region.source_data_type = - cpu_to_be16(RTAS_FADUMP_HPTE_REGION); - fdm.hpte_region.source_address = 0; - fdm.hpte_region.source_len = - cpu_to_be64(fadump_conf->hpte_region_size); - fdm.hpte_region.destination_address = cpu_to_be64(addr); + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_HPTE_REGION); + fdm.rgn[sec_cnt].source_address = 0; + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->hpte_region_size); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); addr += fadump_conf->hpte_region_size; + sec_cnt++; /* * Align boot memory area destination address to page boundary to @@ -115,15 +141,20 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) */ addr = PAGE_ALIGN(addr); - /* RMA region section */ - fdm.rmr_region.request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); - fdm.rmr_region.source_data_type = - cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); - fdm.rmr_region.source_address = cpu_to_be64(0); - fdm.rmr_region.source_len = cpu_to_be64(fadump_conf->boot_memory_size); - fdm.rmr_region.destination_address = 
cpu_to_be64(addr); - addr += fadump_conf->boot_memory_size; + /* First boot memory region destination address */ + fadump_conf->boot_mem_dest_addr = addr; + for (int i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + /* Boot memory regions */ + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_REAL_MODE_REGION); + fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->boot_mem_addr[i]); + fdm.rgn[sec_cnt].source_len = cpu_to_be64(fadump_conf->boot_mem_sz[i]); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(addr); + addr += fadump_conf->boot_mem_sz[i]; + sec_cnt++; + } + fdm.header.dump_num_sections = cpu_to_be16(sec_cnt); rtas_fadump_update_config(fadump_conf, &fdm); return addr; @@ -136,14 +167,21 @@ static u64 rtas_fadump_get_bootmem_min(void) static int rtas_fadump_register(struct fw_dump *fadump_conf) { - unsigned int wait_time; + unsigned int wait_time, fdm_size; int rc, err = -EIO; + /* + * Platform requires the exact size of the Dump Memory Structure. + * Avoid including any unused rgns in the calculation, as this + * could result in a parameter error (-3) from the platform. + */ + fdm_size = sizeof(struct rtas_fadump_section_header); + fdm_size += be16_to_cpu(fdm.header.dump_num_sections) * sizeof(struct rtas_fadump_section); + /* TODO: Add upper time limit for the delay */ do { rc = rtas_call(fadump_conf->ibm_configure_kernel_dump, 3, 1, - NULL, FADUMP_REGISTER, &fdm, - sizeof(struct rtas_fadump_mem_struct)); + NULL, FADUMP_REGISTER, &fdm, fdm_size); wait_time = rtas_busy_delay_time(rc); if (wait_time) @@ -161,9 +199,7 @@ static int rtas_fadump_register(struct fw_dump *fadump_conf) pr_err("Failed to register. 
Hardware Error(%d).\n", rc); break; case -3: - if (!is_fadump_boot_mem_contiguous()) - pr_err("Can't have holes in boot memory area.\n"); - else if (!is_fadump_reserved_mem_contiguous()) + if (!is_fadump_reserved_mem_contiguous()) pr_err("Can't have holes in reserved memory area.\n"); pr_err("Failed to register. Parameter Error(%d).\n", rc); @@ -316,11 +352,9 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) u32 num_cpus, *note_buf; int i, rc = 0, cpu = 0; struct pt_regs regs; - unsigned long addr; void *vaddr; - addr = be64_to_cpu(fdm_active->cpu_state_data.destination_address); - vaddr = __va(addr); + vaddr = (void *)fadump_conf->cpu_state_dest_vaddr; reg_header = vaddr; if (be64_to_cpu(reg_header->magic_number) != @@ -395,18 +429,42 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) return -EINVAL; /* Check if the dump data is valid. */ - if ((be16_to_cpu(fdm_active->header.dump_status_flag) == - RTAS_FADUMP_ERROR_FLAG) || - (fdm_active->cpu_state_data.error_flags != 0) || - (fdm_active->rmr_region.error_flags != 0)) { - pr_err("Dump taken by platform is not valid\n"); - return -EINVAL; - } - if ((fdm_active->rmr_region.bytes_dumped != - fdm_active->rmr_region.source_len) || - !fdm_active->cpu_state_data.bytes_dumped) { - pr_err("Dump taken by platform is incomplete\n"); - return -EINVAL; + for (int i = 0; i < be16_to_cpu(fdm_active->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm_active->rgn[i].source_data_type); + int rc = 0; + + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + case RTAS_FADUMP_HPTE_REGION: + case RTAS_FADUMP_REAL_MODE_REGION: + if (fdm_active->rgn[i].error_flags != 0) { + pr_err("Dump taken by platform is not valid (%d)\n", i); + rc = -EINVAL; + } + if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len) { + pr_err("Dump taken by platform is incomplete (%d)\n", i); + rc = -EINVAL; + } + if (rc) { + pr_warn("Region type: %u src addr: 0x%llx dest addr: 0x%llx\n", + 
be16_to_cpu(fdm_active->rgn[i].source_data_type), + be64_to_cpu(fdm_active->rgn[i].source_address), + be64_to_cpu(fdm_active->rgn[i].destination_address)); + return rc; + } + break; + default: + /* + * If the first/crashed kernel added a new region type that the + * second/fadump kernel doesn't recognize, skip it and process + * assuming backward compatibility. + */ + pr_warn("Unknown region found: type: %u src addr: 0x%llx dest addr: 0x%llx\n", + be16_to_cpu(fdm_active->rgn[i].source_data_type), + be64_to_cpu(fdm_active->rgn[i].source_address), + be64_to_cpu(fdm_active->rgn[i].destination_address)); + break; + } } return rtas_fadump_build_cpu_notes(fadump_conf); @@ -415,7 +473,6 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) static void rtas_fadump_region_show(struct fw_dump *fadump_conf, struct seq_file *m) { - const struct rtas_fadump_section *cpu_data_section; const struct rtas_fadump_mem_struct *fdm_ptr; if (fdm_active) @@ -423,27 +480,42 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, else fdm_ptr = &fdm; - cpu_data_section = &(fdm_ptr->cpu_state_data); - seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", - be64_to_cpu(cpu_data_section->destination_address), - be64_to_cpu(cpu_data_section->destination_address) + - be64_to_cpu(cpu_data_section->source_len) - 1, - be64_to_cpu(cpu_data_section->source_len), - be64_to_cpu(cpu_data_section->bytes_dumped)); - seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", - be64_to_cpu(fdm_ptr->hpte_region.destination_address), - be64_to_cpu(fdm_ptr->hpte_region.destination_address) + - be64_to_cpu(fdm_ptr->hpte_region.source_len) - 1, - be64_to_cpu(fdm_ptr->hpte_region.source_len), - be64_to_cpu(fdm_ptr->hpte_region.bytes_dumped)); + for (int i = 0; i < be16_to_cpu(fdm_ptr->header.dump_num_sections); i++) { + int type = be16_to_cpu(fdm_ptr->rgn[i].source_data_type); - seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", - 
be64_to_cpu(fdm_ptr->rmr_region.source_address), - be64_to_cpu(fdm_ptr->rmr_region.destination_address)); - seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", - be64_to_cpu(fdm_ptr->rmr_region.source_len), - be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped)); + switch (type) { + case RTAS_FADUMP_CPU_STATE_DATA: + seq_printf(m, "CPU :[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + case RTAS_FADUMP_HPTE_REGION: + seq_printf(m, "HPTE:[%#016llx-%#016llx] %#llx bytes, Dumped: %#llx\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + case RTAS_FADUMP_REAL_MODE_REGION: + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rgn[i].source_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rgn[i].source_len), + be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); + break; + default: + seq_printf(m, "Unknown region type %d : Src: %#016llx, Dest: %#016llx, ", + type, be64_to_cpu(fdm_ptr->rgn[i].source_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address)); + break; + } + } /* Dump is active. Show preserved area start address. */ if (fdm_active) { @@ -459,6 +531,20 @@ static void rtas_fadump_trigger(struct fadump_crash_info_header *fdh, rtas_os_term((char *)msg); } +/* FADUMP_MAX_MEM_REGS or lower */ +static int rtas_fadump_max_boot_mem_rgns(void) +{ + /* + * Version 1 of Kernel Assisted Dump Memory Structure (PAPR) supports 10 sections. 
+ * With one each section taken for CPU state data & HPTE respectively, 8 sections + * can be used for boot memory regions. + * + * If new region(s) is(are) defined, maximum boot memory regions will decrease + * proportionally. + */ + return RTAS_FADUMP_MAX_BOOT_MEM_REGS; +} + static struct fadump_ops rtas_fadump_ops = { .fadump_init_mem_struct = rtas_fadump_init_mem_struct, .fadump_get_bootmem_min = rtas_fadump_get_bootmem_min, @@ -468,6 +554,7 @@ static struct fadump_ops rtas_fadump_ops = { .fadump_process = rtas_fadump_process, .fadump_region_show = rtas_fadump_region_show, .fadump_trigger = rtas_fadump_trigger, + .fadump_max_boot_mem_rgns = rtas_fadump_max_boot_mem_rgns, }; void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h index fd59bd7ca9c3..6740f4981bb8 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.h +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -29,6 +29,15 @@ /* Dump status flag */ #define RTAS_FADUMP_ERROR_FLAG 0x2000 +/* + * The Firmware Assisted Dump Memory structure supports a maximum of 10 sections + * in the dump memory structure. Presently, first two sections are used for + * CPU and HPTE data, while the remaining eight sections can be used for + * boot memory regions. + */ +#define MAX_SECTIONS 10 +#define RTAS_FADUMP_MAX_BOOT_MEM_REGS 8 + /* Kernel Dump section info */ struct rtas_fadump_section { __be32 request_flag; @@ -61,20 +70,15 @@ struct rtas_fadump_section_header { * Firmware Assisted dump memory structure. This structure is required for * registering future kernel dump with power firmware through rtas call. * - * No disk dump option. Hence disk dump path string section is not included. + * In version 1, the platform permits one section header, dump-disk path + * and ten sections. + * + * Note: No disk dump option. Hence disk dump path string section is not + * included. 
*/ struct rtas_fadump_mem_struct { struct rtas_fadump_section_header header; - - /* Kernel dump sections */ - struct rtas_fadump_section cpu_state_data; - struct rtas_fadump_section hpte_region; - - /* - * TODO: Extend multiple boot memory regions support in the kernel - * for this platform. - */ - struct rtas_fadump_section rmr_region; + struct rtas_fadump_section rgn[MAX_SECTIONS]; }; /* From 683eab94da75bcf55a9c65e0c31d0529edebe86d Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 9 May 2024 17:27:54 +0530 Subject: [PATCH 81/85] powerpc/fadump: setup additional parameters for dump capture kernel For fadump case, passing additional parameters to dump capture kernel helps in minimizing the memory footprint for it and also provides the flexibility to disable components/modules, like hugepages, that are hindering the boot process of the special dump capture environment. Set up a dedicated parameter area to be passed to the capture kernel. This area type is defined as RTAS_FADUMP_PARAM_AREA. 
Sysfs attribute '/sys/kernel/fadump/bootargs_append' is exported to the userspace to specify the additional parameters to be passed to the capture kernel. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240509115755.519982-3-hbathini@linux.ibm.com --- arch/powerpc/include/asm/fadump-internal.h | 3 + arch/powerpc/kernel/fadump.c | 87 ++++++++++++++++++++ arch/powerpc/platforms/powernv/opal-fadump.c | 6 +- arch/powerpc/platforms/pseries/rtas-fadump.c | 35 +++++++- arch/powerpc/platforms/pseries/rtas-fadump.h | 11 ++- 5 files changed, 133 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 35787fa1ac60..e83869a4eb6a 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -124,6 +124,8 @@ struct fw_dump { unsigned long cpu_notes_buf_vaddr; unsigned long cpu_notes_buf_size; + unsigned long param_area; + /* * Maximum size supported by firmware to copy from source to * destination address per entry.
@@ -138,6 +140,7 @@ struct fw_dump { unsigned long dump_active:1; unsigned long dump_registered:1; unsigned long nocma:1; + unsigned long param_area_supported:1; struct fadump_ops *ops; }; diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index fe6be00451b9..6d35b09d6f3a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1431,6 +1431,43 @@ static ssize_t registered_show(struct kobject *kobj, return sprintf(buf, "%d\n", fw_dump.dump_registered); } +static ssize_t bootargs_append_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", (char *)__va(fw_dump.param_area)); +} + +static ssize_t bootargs_append_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + char *params; + + if (!fw_dump.fadump_enabled || fw_dump.dump_active) + return -EPERM; + + if (count >= COMMAND_LINE_SIZE) + return -EINVAL; + + /* + * Fail here instead of handling this scenario with + * some silly workaround in capture kernel. + */ + if (saved_command_line_len + count >= COMMAND_LINE_SIZE) { + pr_err("Appending parameters exceeds cmdline size!\n"); + return -ENOSPC; + } + + params = __va(fw_dump.param_area); + strscpy_pad(params, buf, COMMAND_LINE_SIZE); + /* Remove newline character at the end. 
*/ + if (params[count-1] == '\n') + params[count-1] = '\0'; + + return count; +} + static ssize_t registered_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -1490,6 +1527,7 @@ static struct kobj_attribute enable_attr = __ATTR_RO(enabled); static struct kobj_attribute register_attr = __ATTR_RW(registered); static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved); static struct kobj_attribute hotplug_ready_attr = __ATTR_RO(hotplug_ready); +static struct kobj_attribute bootargs_append_attr = __ATTR_RW(bootargs_append); static struct attribute *fadump_attrs[] = { &enable_attr.attr, @@ -1663,6 +1701,54 @@ err_out: fadump_invalidate_release_mem(); } +/* + * Reserve memory to store additional parameters to be passed + * for fadump/capture kernel. + */ +static void fadump_setup_param_area(void) +{ + phys_addr_t range_start, range_end; + + if (!fw_dump.param_area_supported || fw_dump.dump_active) + return; + + /* This memory can't be used by PFW or bootloader as it is shared across kernels */ + if (radix_enabled()) { + /* + * Anywhere in the upper half should be good enough as all memory + * is accessible in real mode. + */ + range_start = memblock_end_of_DRAM() / 2; + range_end = memblock_end_of_DRAM(); + } else { + /* + * Passing additional parameters is supported for hash MMU only + * if the first memory block size is 768MB or higher. + */ + if (ppc64_rma_size < 0x30000000) + return; + + /* + * 640 MB to 768 MB is not used by PFW/bootloader. So, try reserving + * memory for passing additional parameters in this range to avoid + * being stomped on by PFW/bootloader. 
+ */ + range_start = 0x2A000000; + range_end = range_start + 0x4000000; + } + + fw_dump.param_area = memblock_phys_alloc_range(COMMAND_LINE_SIZE, + COMMAND_LINE_SIZE, + range_start, + range_end); + if (!fw_dump.param_area || sysfs_create_file(fadump_kobj, &bootargs_append_attr.attr)) { + pr_warn("WARNING: Could not setup area to pass additional parameters!\n"); + return; + } + + memset(phys_to_virt(fw_dump.param_area), 0, COMMAND_LINE_SIZE); +} + /* * Prepare for firmware-assisted dump. */ @@ -1686,6 +1772,7 @@ int __init setup_fadump(void) } /* Initialize the kernel dump memory structure and register with f/w */ else if (fw_dump.reserve_dump_area_size) { + fadump_setup_param_area(); fw_dump.ops->fadump_init_mem_struct(&fw_dump); register_fadump(); } diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c index 5a88d7efb48a..c9c1dfb35464 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.c +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -665,8 +665,10 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) } } - fadump_conf->ops = &opal_fadump_ops; - fadump_conf->fadump_supported = 1; + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; + /* TODO: Add support to pass additional parameters */ + fadump_conf->param_area_supported = 0; /* * Firmware supports 32-bit field for size. 
Align it to PAGE_SIZE diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index 4db78b2bb2a8..eceb3289383e 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -81,6 +82,9 @@ static void __init rtas_fadump_get_config(struct fw_dump *fadump_conf, last_end = base + size; fadump_conf->boot_mem_regs_cnt++; break; + case RTAS_FADUMP_PARAM_AREA: + fadump_conf->param_area = be64_to_cpu(fdm->rgn[i].destination_address); + break; default: pr_warn("Section type %d unsupported on this kernel. Ignoring!\n", type); break; @@ -154,7 +158,17 @@ static u64 rtas_fadump_init_mem_struct(struct fw_dump *fadump_conf) sec_cnt++; } + /* Parameters area */ + if (fadump_conf->param_area) { + fdm.rgn[sec_cnt].request_flag = cpu_to_be32(RTAS_FADUMP_REQUEST_FLAG); + fdm.rgn[sec_cnt].source_data_type = cpu_to_be16(RTAS_FADUMP_PARAM_AREA); + fdm.rgn[sec_cnt].source_address = cpu_to_be64(fadump_conf->param_area); + fdm.rgn[sec_cnt].source_len = cpu_to_be64(COMMAND_LINE_SIZE); + fdm.rgn[sec_cnt].destination_address = cpu_to_be64(fadump_conf->param_area); + sec_cnt++; + } fdm.header.dump_num_sections = cpu_to_be16(sec_cnt); + rtas_fadump_update_config(fadump_conf, &fdm); return addr; @@ -453,6 +467,13 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) return rc; } break; + case RTAS_FADUMP_PARAM_AREA: + if (fdm_active->rgn[i].bytes_dumped != fdm_active->rgn[i].source_len || + fdm_active->rgn[i].error_flags != 0) { + pr_warn("Failed to process additional parameters! 
Proceeding anyway..\n"); + fadump_conf->param_area = 0; + } + break; default: /* * If the first/crashed kernel added a new region type that the @@ -509,6 +530,13 @@ static void rtas_fadump_region_show(struct fw_dump *fadump_conf, be64_to_cpu(fdm_ptr->rgn[i].source_len), be64_to_cpu(fdm_ptr->rgn[i].bytes_dumped)); break; + case RTAS_FADUMP_PARAM_AREA: + seq_printf(m, "\n[%#016llx-%#016llx]: cmdline append: '%s'\n", + be64_to_cpu(fdm_ptr->rgn[i].destination_address), + be64_to_cpu(fdm_ptr->rgn[i].destination_address) + + be64_to_cpu(fdm_ptr->rgn[i].source_len) - 1, + (char *)__va(be64_to_cpu(fdm_ptr->rgn[i].destination_address))); + break; default: seq_printf(m, "Unknown region type %d : Src: %#016llx, Dest: %#016llx, ", type, be64_to_cpu(fdm_ptr->rgn[i].source_address), @@ -571,9 +599,10 @@ void __init rtas_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) if (!token) return; - fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); - fadump_conf->ops = &rtas_fadump_ops; - fadump_conf->fadump_supported = 1; + fadump_conf->ibm_configure_kernel_dump = be32_to_cpu(*token); + fadump_conf->ops = &rtas_fadump_ops; + fadump_conf->fadump_supported = 1; + fadump_conf->param_area_supported = 1; /* Firmware supports 64-bit value for size, align it to pagesize. */ fadump_conf->max_copy_size = ALIGN_DOWN(U64_MAX, PAGE_SIZE); diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.h b/arch/powerpc/platforms/pseries/rtas-fadump.h index 6740f4981bb8..c109abf6befd 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.h +++ b/arch/powerpc/platforms/pseries/rtas-fadump.h @@ -23,6 +23,9 @@ #define RTAS_FADUMP_HPTE_REGION 0x0002 #define RTAS_FADUMP_REAL_MODE_REGION 0x0011 +/* OS defined sections */ +#define RTAS_FADUMP_PARAM_AREA 0x0100 + /* Dump request flag */ #define RTAS_FADUMP_REQUEST_FLAG 0x00000001 @@ -31,12 +34,12 @@ /* * The Firmware Assisted Dump Memory structure supports a maximum of 10 sections - * in the dump memory structure. 
Presently, first two sections are used for - * CPU and HPTE data, while the remaining eight sections can be used for - * boot memory regions. + * in the dump memory structure. Presently, three sections are used for + * CPU state data, HPTE & Parameters area, while the remaining seven sections + * can be used for boot memory regions. */ #define MAX_SECTIONS 10 -#define RTAS_FADUMP_MAX_BOOT_MEM_REGS 8 +#define RTAS_FADUMP_MAX_BOOT_MEM_REGS 7 /* Kernel Dump section info */ struct rtas_fadump_section { From 3416c9daa6b13c0e2a656d4e2dee8de95f9a38cf Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 9 May 2024 17:27:55 +0530 Subject: [PATCH 82/85] powerpc/fadump: pass additional parameters when fadump is active Append the additional parameters passed/set in the dedicated parameter area (RTAS_FADUMP_PARAM_AREA) to bootargs in fadump capture kernel. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240509115755.519982-4-hbathini@linux.ibm.com --- arch/powerpc/include/asm/fadump.h | 2 ++ arch/powerpc/kernel/fadump.c | 35 +++++++++++++++++++++++++++++++ arch/powerpc/kernel/prom.c | 3 +++ 3 files changed, 40 insertions(+) diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 526a6a647312..ef40c9b6972a 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -19,12 +19,14 @@ extern int is_fadump_active(void); extern int should_fadump_crash(void); extern void crash_fadump(struct pt_regs *, const char *); extern void fadump_cleanup(void); +extern void fadump_append_bootargs(void); #else /* CONFIG_FA_DUMP */ static inline int is_fadump_active(void) { return 0; } static inline int should_fadump_crash(void) { return 0; } static inline void crash_fadump(struct pt_regs *regs, const char *str) { } static inline void fadump_cleanup(void) { } +static inline void fadump_append_bootargs(void) { } #endif /* !CONFIG_FA_DUMP */ #if defined(CONFIG_FA_DUMP) || 
defined(CONFIG_PRESERVE_FA_DUMP) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 6d35b09d6f3a..2276bacc4170 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -131,6 +131,41 @@ static int __init fadump_cma_init(void) static int __init fadump_cma_init(void) { return 1; } #endif /* CONFIG_CMA */ +/* + * Additional parameters meant for capture kernel are placed in a dedicated area. + * If this is capture kernel boot, append these parameters to bootargs. + */ +void __init fadump_append_bootargs(void) +{ + char *append_args; + size_t len; + + if (!fw_dump.dump_active || !fw_dump.param_area_supported || !fw_dump.param_area) + return; + + if (fw_dump.param_area >= fw_dump.boot_mem_top) { + if (memblock_reserve(fw_dump.param_area, COMMAND_LINE_SIZE)) { + pr_warn("WARNING: Can't use additional parameters area!\n"); + fw_dump.param_area = 0; + return; + } + } + + append_args = (char *)fw_dump.param_area; + len = strlen(boot_command_line); + + /* + * Too late to fail even if cmdline size exceeds. Truncate additional parameters + * to cmdline size and proceed anyway. + */ + if (len + strlen(append_args) >= COMMAND_LINE_SIZE - 1) + pr_warn("WARNING: Appending parameters exceeds cmdline size. Truncating!\n"); + + pr_debug("Cmdline: %s\n", boot_command_line); + snprintf(boot_command_line + len, COMMAND_LINE_SIZE - len, " %s", append_args); + pr_info("Updated cmdline: %s\n", boot_command_line); +} + /* Scan the Firmware Assisted dump configuration details. 
*/ int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index eb140ea6b6ff..60819751e55e 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -813,6 +813,9 @@ void __init early_init_devtree(void *params) */ of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line); + /* Append additional parameters passed for fadump capture kernel */ + fadump_append_bootargs(); + /* Scan memory nodes and rebuild MEMBLOCKs */ early_init_dt_scan_root(); early_init_dt_scan_memory_ppc(); From 9dc140785961e53b1d45d186961a3b0d374bfc6a Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 10 May 2024 13:51:14 +0530 Subject: [PATCH 83/85] powerpc/fadump: update documentation about bootargs_append Update ABI documentation about the introduction of the new sysfs entry bootargs_append. This sysfs entry will be used to setup the additional parameters to be passed to dump capture kernel. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240510082114.561163-1-hbathini@linux.ibm.com --- Documentation/ABI/testing/sysfs-kernel-fadump | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump b/Documentation/ABI/testing/sysfs-kernel-fadump index c586054657d6..2f9daa7ca55b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-fadump +++ b/Documentation/ABI/testing/sysfs-kernel-fadump @@ -49,3 +49,10 @@ Description: read only memory add/remove events because elfcorehdr is now prepared in the second/fadump kernel. User: kexec-tools + +What: /sys/kernel/fadump/bootargs_append +Date: May 2024 +Contact: linuxppc-dev@lists.ozlabs.org +Description: read/write + This is a special sysfs file available to set up additional + parameters to be passed to the capture kernel.
From 7b090b6ff51b9a9f002139660672f662b95f0630 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 10 May 2024 13:37:57 +0530 Subject: [PATCH 84/85] powerpc/85xx: fix compile error without CONFIG_CRASH_DUMP Since commit 5c4233cc0920 ("powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency"), crashing_cpu is not available without CONFIG_CRASH_DUMP. Fix compile error on 64-BIT 85xx owing to this change. Fixes: 5c4233cc0920 ("powerpc/kdump: Split KEXEC_CORE and CRASH_DUMP dependency") Cc: stable@vger.kernel.org # v6.9+ Reported-by: Christian Zigotzky Closes: https://lore.kernel.org/all/fa247ae4-5825-4dbe-a737-d93b7ab4d4b9@xenosoft.de/ Suggested-by: Michael Ellerman Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://msgid.link/20240510080757.560159-1-hbathini@linux.ibm.com --- arch/powerpc/platforms/85xx/smp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index 40aa58206888..e52b848b64b7 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -398,6 +398,7 @@ static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) hard_irq_disable(); mpic_teardown_this_cpu(secondary); +#ifdef CONFIG_CRASH_DUMP if (cpu == crashing_cpu && cpu_thread_in_core(cpu) != 0) { /* * We enter the crash kernel on whatever cpu crashed, @@ -406,9 +407,11 @@ static void mpc85xx_smp_kexec_cpu_down(int crash_shutdown, int secondary) */ disable_threadbit = 1; disable_cpu = cpu_first_thread_sibling(cpu); - } else if (sibling != crashing_cpu && - cpu_thread_in_core(cpu) == 0 && - cpu_thread_in_core(sibling) != 0) { + } else if (sibling == crashing_cpu) { + return; + } +#endif + if (cpu_thread_in_core(cpu) == 0 && cpu_thread_in_core(sibling) != 0) { disable_threadbit = 2; disable_cpu = sibling; } From 61700f816e6f58f6b1aaa881a69a784d146e30f0 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 16 May 2024 22:14:44 +1000 
Subject: [PATCH 85/85] powerpc/fadump: Fix section mismatch warning With some compilers/configs fadump_setup_param_area() isn't inlined into its caller (which is __init), leading to a section mismatch warning: WARNING: modpost: vmlinux: section mismatch in reference: fadump_setup_param_area+0x200 (section: .text.fadump_setup_param_area) -> memblock_phys_alloc_range (section: .init.text) Fix it by adding an __init annotation. Fixes: 683eab94da75 ("powerpc/fadump: setup additional parameters for dump capture kernel") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20240515163708.3380c4d1@canb.auug.org.au/ Reported-by: kernel test robot Closes: https://lore.kernel.org/all/202405140922.oucLOx4Y-lkp@intel.com/ Signed-off-by: Michael Ellerman Link: https://msgid.link/20240516132631.347956-1-mpe@ellerman.id.au --- arch/powerpc/kernel/fadump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 2276bacc4170..60f974775fc8 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1740,7 +1740,7 @@ err_out: * Reserve memory to store additional parameters to be passed * for fadump/capture kernel. */ -static void fadump_setup_param_area(void) +static void __init fadump_setup_param_area(void) { phys_addr_t range_start, range_end;