From b934b7ff5ea755473d9737d79ab303722ceba22e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 12 Jul 2023 23:15:56 +0800 Subject: [PATCH 01/33] rcu: Delete a redundant check in rcu_check_gp_kthread_starvation() The rcu_check_gp_kthread_starvation() function uses task_cpu() to sample the last CPU that the grace-period kthread ran on, and task_cpu() samples the thread_info structure's ->cpu field. But this field will always contain a number corresponding to a CPU that was online some time in the past, thus never a negative number. This invariant is checked by a WARN_ON_ONCE() in set_task_cpu(). This means that if the grace-period kthread exists, that is, if the "gpk" local variable is non-NULL, the "cpu" local variable will be non-negative. This in turn means that the existing check for non-negative "cpu" is redundant with the enclosing check for non-NULL "gpk". This commit threefore removes the redundant check of "cpu". Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 6f06dc12904a..b5ce0580074e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -537,13 +537,11 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); - if (cpu >= 0) { - if (cpu_is_offline(cpu)) { - pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { - pr_err("Stack dump where RCU GP kthread last ran:\n"); - dump_cpu_task(cpu); - } + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + dump_cpu_task(cpu); } wake_up_process(gpk); } From f3efe02fd56e2498ad8da4c4f444a34aa1456a46 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 12 Jul 2023 23:15:57 +0800 Subject: [PATCH 02/33] rcu: Don't redump the stalled CPU where RCU GP kthread last ran The stacks of all stalled CPUs will be dumped in rcu_dump_cpu_stacks(). If the CPU on where RCU GP kthread last ran is stalled, its stack does not need to be dumped again. We can search the corresponding backtrace based on the printed CPU ID. For example: [ 87.328275] rcu: rcu_sched kthread starved for ... ->cpu=3 <--------| ... ... | [ 89.385007] NMI backtrace for cpu 3 <--------| [ 89.385179] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.10.0+ #22 <--| [ 89.385188] Hardware name: linux,dummy-virt (DT) [ 89.385196] pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) [ 89.385204] pc : arch_cpu_idle+0x40/0xc0 [ 89.385211] lr : arch_cpu_idle+0x2c/0xc0 ... ... [ 89.385566] Call trace: [ 89.385574] arch_cpu_idle+0x40/0xc0 [ 89.385581] default_idle_call+0x100/0x450 [ 89.385589] cpuidle_idle_call+0x2f8/0x460 [ 89.385596] do_idle+0x1dc/0x3d0 [ 89.385604] cpu_startup_entry+0x5c/0xb0 [ 89.385613] secondary_start_kernel+0x35c/0x520 Signed-off-by: Zhen Lei Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5ce0580074e..fc04a8d7ce96 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -534,12 +534,14 @@ static void rcu_check_gp_kthread_starvation(void) data_race(READ_ONCE(rcu_state.gp_state)), gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu); if (gpk) { + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); if (cpu_is_offline(cpu)) { pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); - } else { + } else if (!(data_race(READ_ONCE(rdp->mynode->qsmask)) & rdp->grpmask)) { pr_err("Stack dump where RCU GP kthread last ran:\n"); dump_cpu_task(cpu); } From 243d5ab34446854ceca55146fc0d837655657f8e Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Mon, 24 Jul 2023 10:26:51 +0800 Subject: [PATCH 03/33] rcu: Eliminate check_cpu_stall() duplicate code The code and comments of self-detected and other-detected RCU CPU stall warnings are identical except the output function. This commit therefore refactors so as to consolidate the duplicate code. Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree_stall.h | 42 +++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index fc04a8d7ce96..2443d1d4a6dc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -711,7 +711,7 @@ static void print_cpu_stall(unsigned long gps) static void check_cpu_stall(struct rcu_data *rdp) { - bool didstall = false; + bool self_detected; unsigned long gs1; unsigned long gs2; unsigned long gps; @@ -758,10 +758,10 @@ static void check_cpu_stall(struct rcu_data *rdp) return; /* No stall or GP completed since entering function. */ rnp = rdp->mynode; jn = jiffies + ULONG_MAX / 2; + self_detected = READ_ONCE(rnp->qsmask) & rdp->grpmask; if (rcu_gp_in_progress() && - (READ_ONCE(rnp->qsmask) & rdp->grpmask) && + (self_detected || ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) && cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - /* * If a virtual machine is stopped by the host it can look to * the watchdog like an RCU stall. Check to see if the host @@ -770,33 +770,21 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; - /* We haven't checked in, so go dump stack. */ - print_cpu_stall(gps); + if (self_detected) { + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(gps); + } else { + /* They had a few time units to dump stack, so complain. */ + print_other_cpu_stall(gs2, gps); + } + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } else if (rcu_gp_in_progress() && - ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) && - cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) { - - /* - * If a virtual machine is stopped by the host it can look to - * the watchdog like an RCU stall. Check to see if the host - * stopped the vm. - */ - if (kvm_check_and_clear_guest_paused()) - return; - - /* They had a few time units to dump stack, so complain. */ - print_other_cpu_stall(gs2, gps); - if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) - rcu_ftrace_dump(DUMP_ALL); - didstall = true; - } - if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) { - jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; - WRITE_ONCE(rcu_state.jiffies_stall, jn); + if (READ_ONCE(rcu_state.jiffies_stall) == jn) { + jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; + WRITE_ONCE(rcu_state.jiffies_stall, jn); + } } } From 5b404fdabacf4bee92d8c66013402a85f18a6a10 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Aug 2023 15:46:32 -0700 Subject: [PATCH 04/33] rcu: Add RCU CPU stall notifier It is sometimes helpful to have a way for the subsystem causing the stall to dump its state when an RCU CPU stall occurs. This commit therefore bases rcu_stall_chain_notifier_register() and rcu_stall_chain_notifier_unregister() on atomic notifiers in order to provide this functionality. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/rcu_notifier.h | 32 +++++++++++++++++++ kernel/rcu/rcu.h | 6 ++++ kernel/rcu/tree_exp.h | 6 +++- kernel/rcu/tree_stall.h | 59 +++++++++++++++++++++++++++++++++++- 4 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 include/linux/rcu_notifier.h diff --git a/include/linux/rcu_notifier.h b/include/linux/rcu_notifier.h new file mode 100644 index 000000000000..ebf371364581 --- /dev/null +++ b/include/linux/rcu_notifier.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Read-Copy Update notifiers, initially RCU CPU stall notifier. + * Separate from rcupdate.h to avoid #include loops. + * + * Copyright (C) 2023 Paul E. McKenney. + */ + +#ifndef __LINUX_RCU_NOTIFIER_H +#define __LINUX_RCU_NOTIFIER_H + +// Actions for RCU CPU stall notifier calls. +#define RCU_STALL_NOTIFY_NORM 1 +#define RCU_STALL_NOTIFY_EXP 2 + +#ifdef CONFIG_RCU_STALL_COMMON + +#include +#include + +int rcu_stall_chain_notifier_register(struct notifier_block *n); +int rcu_stall_chain_notifier_unregister(struct notifier_block *n); + +#else // #ifdef CONFIG_RCU_STALL_COMMON + +// No RCU CPU stall warnings in Tiny RCU. +static inline int rcu_stall_chain_notifier_register(struct notifier_block *n) { return -EEXIST; } +static inline int rcu_stall_chain_notifier_unregister(struct notifier_block *n) { return -ENOENT; } + +#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON + +#endif /* __LINUX_RCU_NOTIFIER_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..ef3bab977407 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -654,4 +654,10 @@ static inline bool rcu_cpu_beenfullyonline(int cpu) { return true; } bool rcu_cpu_beenfullyonline(int cpu); #endif +#ifdef CONFIG_RCU_STALL_COMMON +int rcu_stall_notifier_call_chain(unsigned long val, void *v); +#else // #ifdef CONFIG_RCU_STALL_COMMON +static inline int rcu_stall_notifier_call_chain(unsigned long val, void *v) { return NOTIFY_DONE; } +#endif // #else // #ifdef CONFIG_RCU_STALL_COMMON + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8239b39d945b..6d7cea5d591f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -621,10 +621,14 @@ static void synchronize_rcu_expedited_wait(void) } for (;;) { + unsigned long j; + if (synchronize_rcu_expedited_wait_once(jiffies_stall)) return; if (rcu_stall_is_suppressed()) continue; + j = jiffies; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_EXP, (void *)(j - jiffies_start)); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -647,7 +651,7 @@ static void synchronize_rcu_expedited_wait(void) } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rcu_state.expedited_sequence, + j - jiffies_start, rcu_state.expedited_sequence, data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 2443d1d4a6dc..49544f932279 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -8,6 +8,7 @@ */ #include +#include ////////////////////////////////////////////////////////////////////////////// // @@ -770,6 +771,7 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (self_detected) { /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); @@ -790,7 +792,7 @@ static void check_cpu_stall(struct rcu_data *rdp) ////////////////////////////////////////////////////////////////////////////// // -// RCU forward-progress mechanisms, including of callback invocation. +// RCU forward-progress mechanisms, including for callback invocation. /* @@ -1042,3 +1044,58 @@ static int __init rcu_sysrq_init(void) return 0; } early_initcall(rcu_sysrq_init); + + +////////////////////////////////////////////////////////////////////////////// +// +// RCU CPU stall-warning notifiers + +static ATOMIC_NOTIFIER_HEAD(rcu_cpu_stall_notifier_list); + +/** + * rcu_stall_chain_notifier_register - Add an RCU CPU stall notifier + * @n: Entry to add. + * + * Adds an RCU CPU stall notifier to an atomic notifier chain. + * The @action passed to a notifier will be @RCU_STALL_NOTIFY_NORM or + * friends. The @data will be the duration of the stalled grace period, + * in jiffies, coerced to a void* pointer. + * + * Returns 0 on success, %-EEXIST on error. + */ +int rcu_stall_chain_notifier_register(struct notifier_block *n) +{ + return atomic_notifier_chain_register(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_register); + +/** + * rcu_stall_chain_notifier_unregister - Remove an RCU CPU stall notifier + * @n: Entry to add. + * + * Removes an RCU CPU stall notifier from an atomic notifier chain. + * + * Returns zero on success, %-ENOENT on failure. + */ +int rcu_stall_chain_notifier_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&rcu_cpu_stall_notifier_list, n); +} +EXPORT_SYMBOL_GPL(rcu_stall_chain_notifier_unregister); + +/* + * rcu_stall_notifier_call_chain - Call functions in an RCU CPU stall notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * + * Calls each function in the RCU CPU stall notifier chain in turn, which + * is an atomic call chain. See atomic_notifier_call_chain() for more + * information. + * + * This is for use within RCU, hence the omission of the extra asterisk + * to indicate a non-kerneldoc format header comment. + */ +int rcu_stall_notifier_call_chain(unsigned long val, void *v) +{ + return atomic_notifier_call_chain(&rcu_cpu_stall_notifier_list, val, v); +} From 7c1b3e0c988f2902695ef6175ab8ad00c0e8b65f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Aug 2023 15:53:32 -0700 Subject: [PATCH 05/33] rcutorture: Add test of RCU CPU stall notifiers This commit registers an RCU CPU stall notifier when testing RCU CPU stalls. The notifier logs a message similar to the following: rcu_torture_stall_nf: v=1, duration=21001. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcutorture.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ade42d6a9d9b..1830cacac01d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -2428,6 +2429,16 @@ static int rcutorture_booster_init(unsigned int cpu) return 0; } +static int rcu_torture_stall_nf(struct notifier_block *nb, unsigned long v, void *ptr) +{ + pr_info("%s: v=%lu, duration=%lu.\n", __func__, v, (unsigned long)ptr); + return NOTIFY_OK; +} + +static struct notifier_block rcu_torture_stall_block = { + .notifier_call = rcu_torture_stall_nf, +}; + /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -2435,9 +2446,14 @@ static int rcutorture_booster_init(unsigned int cpu) static int rcu_torture_stall(void *args) { int idx; + int ret; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); + ret = rcu_stall_chain_notifier_register(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_register() returned %d, %sexpected.\n", + __func__, ret, !IS_ENABLED(CONFIG_RCU_STALL_COMMON) ? "un" : ""); if (stall_cpu_holdoff > 0) { VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); @@ -2481,6 +2497,11 @@ static int rcu_torture_stall(void *args) cur_ops->readunlock(idx); } pr_alert("%s end.\n", __func__); + if (!ret) { + ret = rcu_stall_chain_notifier_unregister(&rcu_torture_stall_block); + if (ret) + pr_info("%s: rcu_stall_chain_notifier_unregister() returned %d.\n", __func__, ret); + } torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); From b96e7a5fa0ba9cda32888e04f8f4bac42d49a7f8 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Sep 2023 00:02:11 +0000 Subject: [PATCH 06/33] rcu/tree: Defer setting of jiffies during stall reset There are instances where rcu_cpu_stall_reset() is called when jiffies did not get a chance to update for a long time. Before jiffies is updated, the CPU stall detector can go off triggering false-positives where a just-started grace period appears to be ages old. In the past, we disabled stall detection in rcu_cpu_stall_reset() however this got changed [1]. This is resulting in false-positives in KGDB usecase [2]. Fix this by deferring the update of jiffies to the third run of the FQS loop. This is more robust, as, even if rcu_cpu_stall_reset() is called just before jiffies is read, we would end up pushing out the jiffies read by 3 more FQS loops. Meanwhile the CPU stall detection will be delayed and we will not get any false positives. [1] https://lore.kernel.org/all/20210521155624.174524-2-senozhatsky@chromium.org/ [2] https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/ Tested with rcutorture.cpu_stall option as well to verify stall behavior with/without patch. Tested-by: Huacai Chen Reported-by: Binbin Zhou Closes: https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/ Suggested-by: Paul McKenney Cc: Sergey Senozhatsky Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: a80be428fbc1 ("rcu: Do not disable GP stall detection in rcu_cpu_stall_reset()") Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 12 ++++++++++++ kernel/rcu/tree.h | 4 ++++ kernel/rcu/tree_stall.h | 20 ++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..d85779f67aea 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1556,10 +1556,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp) */ static void rcu_gp_fqs(bool first_time) { + int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall); struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); + + WARN_ON_ONCE(nr_fqs > 3); + /* Only countdown nr_fqs for stall purposes if jiffies moves. */ + if (nr_fqs) { + if (nr_fqs == 1) { + WRITE_ONCE(rcu_state.jiffies_stall, + jiffies + rcu_jiffies_till_stall_check()); + } + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs); + } + if (first_time) { /* Collect dyntick-idle snapshots. */ force_qs_rnp(dyntick_save_progress_counter); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 192536916f9a..e9821a8422db 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -386,6 +386,10 @@ struct rcu_state { /* in jiffies. */ unsigned long jiffies_stall; /* Time at which to check */ /* for CPU stalls. */ + int nr_fqs_jiffies_stall; /* Number of fqs loops after + * which read jiffies and set + * jiffies_stall. Stall + * warnings disabled if !0. */ unsigned long jiffies_resched; /* Time at which to resched */ /* a reluctant CPU. */ unsigned long n_force_qs_gpstart; /* Snapshot of n_force_qs at */ diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 49544f932279..ac8e86babe44 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -150,12 +150,17 @@ static void panic_on_rcu_stall(void) /** * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period * + * To perform the reset request from the caller, disable stall detection until + * 3 fqs loops have passed. This is required to ensure a fresh jiffies is + * loaded. It should be safe to do from the fqs loop as enough timer + * interrupts and context switches should have passed. + * * The caller must disable hard irqs. */ void rcu_cpu_stall_reset(void) { - WRITE_ONCE(rcu_state.jiffies_stall, - jiffies + rcu_jiffies_till_stall_check()); + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3); + WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX); } ////////////////////////////////////////////////////////////////////////////// @@ -171,6 +176,7 @@ static void record_gp_stall_check_time(void) WRITE_ONCE(rcu_state.gp_start, j); j1 = rcu_jiffies_till_stall_check(); smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq. + WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0); WRITE_ONCE(rcu_state.jiffies_stall, j + j1); rcu_state.jiffies_resched = j + j1 / 2; rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs); @@ -726,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp) !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); + + /* + * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS + * loop has to set jiffies to ensure a non-stale jiffies value. This + * is required to have good jiffies value after coming out of long + * breaks of jiffies updates. Not doing so can cause false positives. + */ + if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0) + return; + j = jiffies; /* From 92a708dc1fb8c33b0017ad77dc7ff6e434f96ee2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Jul 2023 13:13:46 -0700 Subject: [PATCH 07/33] rcu-tasks: Add printk()s to localize boot-time self-test hang Currently, rcu_tasks_initiate_self_tests() prints a message and then initiates self tests on up to three different RCU Tasks flavors. If one of the flavors has a grace-period hang, it is not easy to work out which of the three hung. This commit therefore prints a message prior to each individual test. Reported-by: Guenter Roeck Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..83049a893de5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1979,20 +1979,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) static void rcu_tasks_initiate_self_tests(void) { - pr_info("Running RCU-tasks wait API self tests\n"); #ifdef CONFIG_TASKS_RCU + pr_info("Running RCU Tasks wait API self tests\n"); tests[0].runstart = jiffies; synchronize_rcu_tasks(); call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_RUDE_RCU + pr_info("Running RCU Tasks Rude wait API self tests\n"); tests[1].runstart = jiffies; synchronize_rcu_tasks_rude(); call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); #endif #ifdef CONFIG_TASKS_TRACE_RCU + pr_info("Running RCU Tasks Trace wait API self tests\n"); tests[2].runstart = jiffies; synchronize_rcu_tasks_trace(); call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); From e62d8ae4620865411d1b2347980aa28ccf891a3d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Aug 2023 13:42:00 -0700 Subject: [PATCH 08/33] rcu-tasks: Pull sampling of ->percpu_dequeue_lim out of loop The rcu_tasks_need_gpcb() samples ->percpu_dequeue_lim as part of the condition clause of a "for" loop, which is a bit confusing. This commit therefore hoists this sampling out of the loop, using the result loaded in the condition clause. So why does this work in the face of a concurrent switch from single-CPU queueing to per-CPU queueing? o The call_rcu_tasks_generic() that makes the change has already enqueued its callback, which means that all of the other CPU's callback queues are empty. o For the call_rcu_tasks_generic() that first notices the switch to per-CPU queues, the smp_store_release() used to update ->percpu_enqueue_lim pairs with the raw_spin_trylock_rcu_node()'s full barrier that is between the READ_ONCE(rtp->percpu_enqueue_shift) and the rcu_segcblist_enqueue() that enqueues the callback. o Because this CPU's queue is empty (unless it happens to be the original single queue, in which case there is no need for synchronization), this call_rcu_tasks_generic() will do an irq_work_queue() to schedule a handler for the needed rcuwait_wake_up() call. This call will be ordered after the first call_rcu_tasks_generic() function's change to ->percpu_dequeue_lim. o This rcuwait_wake_up() will either happen before or after the set_current_state() in rcuwait_wait_event(). If it happens before, the "condition" argument's call to rcu_tasks_need_gpcb() will be ordered after the original change, and all callbacks on all CPUs will be visible. Otherwise, if it happens after, then the grace-period kthread's state will be set back to running, which will result in a later call to rcuwait_wait_event() and thus to rcu_tasks_need_gpcb(), which will again see the change. So it all works out. Suggested-by: Linus Torvalds Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 83049a893de5..94bb5abdbb37 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) { int cpu; + int dequeue_limit; unsigned long flags; bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq); long n; @@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) long ncbsnz = 0; int needgpcb = 0; - for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim); + for (cpu = 0; cpu < dequeue_limit; cpu++) { struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); /* Advance and accelerate any new callbacks. */ From 0325e8a1282dfd30c3e44928c6384bb978649c63 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 3 Aug 2023 16:06:59 +0800 Subject: [PATCH 09/33] rcu-tasks: Make rcu_tasks_lazy_ms static The rcu_tasks_lazy_ms variable is not used outside the file tasks.h, so this commit marks it static. kernel/rcu/tasks.h:1085:5: warning: symbol 'rcu_tasks_lazy_ms' was not declared. Should it be static? Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=6086 Signed-off-by: Jiapeng Chong Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 94bb5abdbb37..018f03f20629 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1086,7 +1086,7 @@ void rcu_barrier_tasks(void) } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); -int rcu_tasks_lazy_ms = -1; +static int rcu_tasks_lazy_ms = -1; module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) From 730c3ed4ba30feadfb20b4d4d18e869bc00348f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Aug 2023 09:30:18 -0700 Subject: [PATCH 10/33] refscale: Fix misplaced data re-read This commit fixes a misplaced data re-read in the typesafe code. The reason that this was not noticed is that this is a performance test with no writers, so a mismatch could not occur. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 91a0fd0d4d9a..750a63e99539 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -655,12 +655,12 @@ retry: goto retry; } un_delay(udl, ndl); + b = READ_ONCE(rtsp->a); // Remember, seqlock read-side release can fail. if (!rts_release(rtsp, start)) { rcu_read_unlock(); goto retry; } - b = READ_ONCE(rtsp->a); WARN_ONCE(a != b, "Re-read of ->a changed from %u to %u.\n", a, b); b = rtsp->b; rcu_read_unlock(); From d6fea1dde2064c8f298dbe872587c7ace7701b3f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 11:27:26 -0700 Subject: [PATCH 11/33] refscale: Print out additional module parameters The refscale.verbose_batched and refscale.lookup_instances module parameters are omitted from the ref_scale_print_module_parms() beginning-of-test output. This commit therefore adds them. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/refscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 750a63e99539..2c2648a3ad30 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -1025,8 +1025,8 @@ static void ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag) { pr_alert("%s" SCALE_FLAG - "--- %s: verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, - verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay); + "--- %s: verbose=%d verbose_batched=%d shutdown=%d holdoff=%d lookup_instances=%ld loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag, + verbose, verbose_batched, shutdown, holdoff, lookup_instances, loops, nreaders, nruns, readdelay); } static void From 8a4c0c90f2796fd3ac96326e8e2f8b13f7bdc99c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 16 Aug 2023 11:29:05 -0700 Subject: [PATCH 12/33] doc: Add refscale.lookup_instances to kernel-parameters.txt This commit adds refscale.lookup_instances to kernel-parameters.txt. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..cf4f3262ce31 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5422,6 +5422,12 @@ test until boot completes in order to avoid interference. + refscale.lookup_instances= [KNL] + Number of data elements to use for the forms of + SLAB_TYPESAFE_BY_RCU testing. A negative number + is negated and multiplied by nr_cpu_ids, while + zero specifies nr_cpu_ids. + refscale.loops= [KNL] Set the number of loops over the synchronization primitive under test. Increasing this number From ebbb9d35fd8497591760779b3d5275fb4ce0e50d Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Tue, 8 Aug 2023 23:58:11 +0800 Subject: [PATCH 13/33] Documentation: RCU: Fix section numbers after adding Section 7 in whatisRCU.rst Signed-off-by: Wei Zhang Reviewed-by: Randy Dunlap Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/RCU/whatisRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index e488c8e557a9..60ce02475142 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -59,8 +59,8 @@ experiment with should focus on Section 2. People who prefer to start with example uses should focus on Sections 3 and 4. People who need to understand the RCU implementation should focus on Section 5, then dive into the kernel source code. People who reason best by analogy should -focus on Section 6. Section 7 serves as an index to the docbook API -documentation, and Section 8 is the traditional answer key. +focus on Section 6 and 7. Section 8 serves as an index to the docbook +API documentation, and Section 9 is the traditional answer key. So, start with the section that makes the most sense to you and your preferred method of learning. If you need to know everything about From 082acfe39cb0090e97bf27057d0efdf1e89abbef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 21:04:02 +0100 Subject: [PATCH 14/33] rcu: Describe listRCU read-side guarantees More explicitly state what is, and what is not guaranteed to those who iterate a list while protected by RCU. [ paulmck: Apply Joel Fernandes feedback. ] Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- Documentation/RCU/listRCU.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/RCU/listRCU.rst b/Documentation/RCU/listRCU.rst index bdc4bcc5289f..ed5c9d8c9afe 100644 --- a/Documentation/RCU/listRCU.rst +++ b/Documentation/RCU/listRCU.rst @@ -8,6 +8,15 @@ One of the most common uses of RCU is protecting read-mostly linked lists that all of the required memory ordering is provided by the list macros. This document describes several list-based RCU use cases. +When iterating a list while holding the rcu_read_lock(), writers may +modify the list. The reader is guaranteed to see all of the elements +which were added to the list before they acquired the rcu_read_lock() +and are still on the list when they drop the rcu_read_unlock(). +Elements which are added to, or removed from the list may or may not +be seen. If the writer calls list_replace_rcu(), the reader may see +either the old element or the new element; they will not see both, +nor will they see neither. + Example 1: Read-mostly list: Deferred Destruction ------------------------------------------------- From ff18cb816427c3738d5d3fd23b7939a813935e15 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 25 Jul 2023 23:29:10 +0000 Subject: [PATCH 15/33] Revert "checkpatch: Error out if deprecated RCU API used" The definition for single-argument kfree_rcu() has been removed, so that any further attempt to use it will result in a build error. Because of this build error, there is no longer any need for a special check in checkpatch.pl. Therefore, revert commit 1eacac3255495be7502d406e2ba5444fb5c3607c. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- scripts/checkpatch.pl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7d16f863edf1..25fdb7fda112 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6427,15 +6427,6 @@ sub process { } } -# check for soon-to-be-deprecated single-argument k[v]free_rcu() API - if ($line =~ /\bk[v]?free_rcu\s*\([^(]+\)/) { - if ($line =~ /\bk[v]?free_rcu\s*\([^,]+\)/) { - ERROR("DEPRECATED_API", - "Single-argument k[v]free_rcu() API is deprecated, please pass rcu_head object or call k[v]free_rcu_mightsleep()." . $herecurr); - } - } - - # check for unnecessary "Out of Memory" messages if ($line =~ /^\+.*\b$logFunctions\s*\(/ && $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && From f0a31b26be1f725e3f56beed76486c1b034120b3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 29 Jul 2023 14:27:32 +0000 Subject: [PATCH 16/33] srcu: Fix error handling in init_srcu_struct_fields() The current error handling in init_srcu_struct_fields() is a bit inconsistent. If init_srcu_struct_nodes() fails, the function either returns -ENOMEM or 0 depending on whether ssp->sda_is_static is true or false. This can make init_srcu_struct_fields() return 0 even if memory allocation failed! Simplify the error handling by always returning -ENOMEM if either init_srcu_struct_nodes() or the per-CPU allocation fails. This makes the control flow easier to follow and avoids the inconsistent return values. Add goto labels to avoid duplicating the error cleanup code. Link: https://lore.kernel.org/r/20230404003508.GA254019@google.com Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 20d7a238d675..f1a905200fc2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -255,29 +255,31 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) ssp->srcu_sup->sda_is_static = is_static; if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); - if (!ssp->sda) { - if (!is_static) - kfree(ssp->srcu_sup); - return -ENOMEM; - } + if (!ssp->sda) + goto err_free_sup; init_srcu_struct_data(ssp); ssp->srcu_sup->srcu_gp_seq_needed_exp = 0; ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns(); if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) { - if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) { - if (!ssp->srcu_sup->sda_is_static) { - free_percpu(ssp->sda); - ssp->sda = NULL; - kfree(ssp->srcu_sup); - return -ENOMEM; - } - } else { - WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); - } + if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) + goto err_free_sda; + WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG); } ssp->srcu_sup->srcu_ssp = ssp; smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */ return 0; + +err_free_sda: + if (!is_static) { + free_percpu(ssp->sda); + ssp->sda = NULL; + } +err_free_sup: + if (!is_static) { + kfree(ssp->srcu_sup); + ssp->srcu_sup = NULL; + } + return -ENOMEM; } #ifdef CONFIG_DEBUG_LOCK_ALLOC From 4502138acc8f4139c94ce0ec0eee926f8805fbbc Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 29 Jul 2023 14:27:36 +0000 Subject: [PATCH 17/33] rcu/tree: Remove superfluous return from void call_rcu* functions The return keyword is not needed here. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..7c79480bfaa0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2713,7 +2713,7 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, false); + __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); #endif @@ -2764,7 +2764,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - return __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); } EXPORT_SYMBOL_GPL(call_rcu); From 16128b1f8c823438dcd3f3b4a57cbe7267bcf82f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Aug 2023 17:15:25 -0700 Subject: [PATCH 18/33] rcu: Add sysfs to provide throttled access to rcu_barrier() When running a series of stress tests all making heavy use of RCU, it is all too possible to OOM the system when the prior test's RCU callbacks don't get invoked until after the subsequent test starts. One way of handling this is just a timed wait, but this fails when a given CPU has so many callbacks queued that they take longer to invoke than allowed for by that timed wait. This commit therefore adds an rcutree.do_rcu_barrier module parameter that is accessible from sysfs. Writing one of the many synonyms for boolean "true" will cause an rcu_barrier() to be invoked, but will guarantee that no more than one rcu_barrier() will be invoked per sixteenth of a second via this mechanism. The flip side is that a given request might wait a second or three longer than absolutely necessary, but only when there are multiple uses of rcutree.do_rcu_barrier within a one-second time interval. This commit unnecessarily serializes the rcu_barrier() machinery, given that serialization is already provided by procfs. This has the advantage of allowing throttled rcu_barrier() from other sources within the kernel. Reported-by: Johannes Weiner Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../admin-guide/kernel-parameters.txt | 7 ++ kernel/rcu/tree.c | 76 +++++++++++++++++++ 2 files changed, 83 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0a1731a0f0ef..7ec8a406d419 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4769,6 +4769,13 @@ Set maximum number of finished RCU callbacks to process in one batch. + rcutree.do_rcu_barrier= [KNL] + Request a call to rcu_barrier(). This is + throttled so that userspace tests can safely + hammer on the sysfs variable if they so choose. + If triggered before the RCU grace-period machinery + is fully active, this will error out with EAGAIN. + rcutree.dump_tree= [KNL] Dump the structure of the rcu_node combining tree out at early boot. This is used for diagnostic diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7c79480bfaa0..3c7281fc25a7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4083,6 +4083,82 @@ retry: } EXPORT_SYMBOL_GPL(rcu_barrier); +static unsigned long rcu_barrier_last_throttle; + +/** + * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second + * + * This can be thought of as guard rails around rcu_barrier() that + * permits unrestricted userspace use, at least assuming the hardware's + * try_cmpxchg() is robust. There will be at most one call per second to + * rcu_barrier() system-wide from use of this function, which means that + * callers might needlessly wait a second or three. + * + * This is intended for use by test suites to avoid OOM by flushing RCU + * callbacks from the previous test before starting the next. See the + * rcutree.do_rcu_barrier module parameter for more information. + * + * Why not simply make rcu_barrier() more scalable? That might be + * the eventual endpoint, but let's keep it simple for the time being. + * Note that the module parameter infrastructure serializes calls to a + * given .set() function, but should concurrent .set() invocation ever be + * possible, we are ready! + */ +static void rcu_barrier_throttled(void) +{ + unsigned long j = jiffies; + unsigned long old = READ_ONCE(rcu_barrier_last_throttle); + unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence); + + while (time_in_range(j, old, old + HZ / 16) || + !try_cmpxchg(&rcu_barrier_last_throttle, &old, j)) { + schedule_timeout_idle(HZ / 16); + if (rcu_seq_done(&rcu_state.barrier_sequence, s)) { + smp_mb(); /* caller's subsequent code after above check. */ + return; + } + j = jiffies; + old = READ_ONCE(rcu_barrier_last_throttle); + } + rcu_barrier(); +} + +/* + * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier + * request arrives. We insist on a true value to allow for possible + * future expansion. + */ +static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) +{ + bool b; + int ret; + + if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) + return -EAGAIN; + ret = kstrtobool(val, &b); + if (!ret && b) { + atomic_inc((atomic_t *)kp->arg); + rcu_barrier_throttled(); + atomic_dec((atomic_t *)kp->arg); + } + return ret; +} + +/* + * Output the number of outstanding rcutree.do_rcu_barrier requests. + */ +static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%d\n", atomic_read((atomic_t *)kp->arg)); +} + +static const struct kernel_param_ops do_rcu_barrier_ops = { + .set = param_set_do_rcu_barrier, + .get = param_get_do_rcu_barrier, +}; +static atomic_t do_rcu_barrier; +module_param_cb(do_rcu_barrier, &do_rcu_barrier_ops, &do_rcu_barrier, 0644); + /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is From b93c5fe16e4aa177cd072c1c4652cbe1b19a7812 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 3 Aug 2023 21:49:19 +0800 Subject: [PATCH 19/33] rcu: Remove unused function declaration rcu_eqs_special_set() Commit a86baa69c2b7 ("rcu: Remove special bit at the bottom of the ->dynticks counter") left behind this, remove it. Signed-off-by: Yue Haibing Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rcutree.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 126f6b418f6a..153cfc7bbffd 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -37,7 +37,6 @@ void synchronize_rcu_expedited(void); void kvfree_call_rcu(struct rcu_head *head, void *ptr); void rcu_barrier(void); -bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); From 6e284c55fc0bef7d25fd34d29db11f483da60ea4 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 5 Aug 2023 11:17:25 +0800 Subject: [PATCH 20/33] mm: Remove kmem_valid_obj() Function kmem_dump_obj() will splat if passed a pointer to a non-slab object. So nothing calls it directly, instead calling kmem_valid_obj() first to determine whether the passed pointer to a valid slab object. This means that merging kmem_valid_obj() into kmem_dump_obj() will make the code more concise. Therefore, convert kmem_dump_obj() to work the same way as vmalloc_dump_obj(), removing the need for the kmem_dump_obj() caller to check kmem_valid_obj(). After this, there are no remaining calls to kmem_valid_obj() anymore, and it can be safely removed. Suggested-by: Matthew Wilcox Signed-off-by: Zhen Lei Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/slab.h | 5 +++-- mm/slab_common.c | 41 +++++++++++------------------------------ mm/util.c | 4 +--- 3 files changed, 15 insertions(+), 35 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 8228d1276a2f..ff56ab804bf6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -245,8 +245,9 @@ DEFINE_FREE(kfree, void *, if (_T) kfree(_T)) size_t ksize(const void *objp); #ifdef CONFIG_PRINTK -bool kmem_valid_obj(void *object); -void kmem_dump_obj(void *object); +bool kmem_dump_obj(void *object); +#else +static inline bool kmem_dump_obj(void *object) { return false; } #endif /* diff --git a/mm/slab_common.c b/mm/slab_common.c index cd71f9581e67..a425bedf2103 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -528,26 +528,6 @@ bool slab_is_available(void) } #ifdef CONFIG_PRINTK -/** - * kmem_valid_obj - does the pointer reference a valid slab object? - * @object: pointer to query. - * - * Return: %true if the pointer is to a not-yet-freed object from - * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer - * is to an already-freed object, and %false otherwise. - */ -bool kmem_valid_obj(void *object) -{ - struct folio *folio; - - /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ - if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) - return false; - folio = virt_to_folio(object); - return folio_test_slab(folio); -} -EXPORT_SYMBOL_GPL(kmem_valid_obj); - static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { if (__kfence_obj_info(kpp, object, slab)) @@ -566,11 +546,11 @@ static void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab * * and, if available, the slab name, return address, and stack trace from * the allocation and last free path of that object. * - * This function will splat if passed a pointer to a non-slab object. - * If you are not sure what type of object you have, you should instead - * use mem_dump_obj(). + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. */ -void kmem_dump_obj(void *object) +bool kmem_dump_obj(void *object) { char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; int i; @@ -578,13 +558,13 @@ void kmem_dump_obj(void *object) unsigned long ptroffset; struct kmem_obj_info kp = { }; - if (WARN_ON_ONCE(!virt_addr_valid(object))) - return; + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; slab = virt_to_slab(object); - if (WARN_ON_ONCE(!slab)) { - pr_cont(" non-slab memory.\n"); - return; - } + if (!slab) + return false; + kmem_obj_info(&kp, object, slab); if (kp.kp_slab_cache) pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); @@ -621,6 +601,7 @@ void kmem_dump_obj(void *object) pr_info(" %pS\n", kp.kp_free_stack[i]); } + return true; } EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif diff --git a/mm/util.c b/mm/util.c index 8cbbfd3a3d59..6eddd891198e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1060,10 +1060,8 @@ void mem_dump_obj(void *object) { const char *type; - if (kmem_valid_obj(object)) { - kmem_dump_obj(object); + if (kmem_dump_obj(object)) return; - } if (vmalloc_dump_obj(object)) return; From 2cbc482d325ee58001472c4359b311958c4efdd1 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 5 Aug 2023 11:17:26 +0800 Subject: [PATCH 21/33] rcu: Dump memory object info if callback function is invalid When a structure containing an RCU callback rhp is (incorrectly) freed and reallocated after rhp is passed to call_rcu(), it is not unusual for rhp->func to be set to NULL. This defeats the debugging prints used by __call_rcu_common() in kernels built with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, which expect to identify the offending code using the identity of this function. And in kernels build without CONFIG_DEBUG_OBJECTS_RCU_HEAD=y, things are even worse, as can be seen from this splat: Unable to handle kernel NULL pointer dereference at virtual address 0 ... ... PC is at 0x0 LR is at rcu_do_batch+0x1c0/0x3b8 ... ... (rcu_do_batch) from (rcu_core+0x1d4/0x284) (rcu_core) from (__do_softirq+0x24c/0x344) (__do_softirq) from (__irq_exit_rcu+0x64/0x108) (__irq_exit_rcu) from (irq_exit+0x8/0x10) (irq_exit) from (__handle_domain_irq+0x74/0x9c) (__handle_domain_irq) from (gic_handle_irq+0x8c/0x98) (gic_handle_irq) from (__irq_svc+0x5c/0x94) (__irq_svc) from (arch_cpu_idle+0x20/0x3c) (arch_cpu_idle) from (default_idle_call+0x4c/0x78) (default_idle_call) from (do_idle+0xf8/0x150) (do_idle) from (cpu_startup_entry+0x18/0x20) (cpu_startup_entry) from (0xc01530) This commit therefore adds calls to mem_dump_obj(rhp) to output some information, for example: slab kmalloc-256 start ffff410c45019900 pointer offset 0 size 256 This provides the rough size of the memory block and the offset of the rcu_head structure, which as least provides at least a few clues to help locate the problem. If the problem is reproducible, additional slab debugging can be enabled, for example, CONFIG_DEBUG_SLAB=y, which can provide significantly more information. Signed-off-by: Zhen Lei Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu.h | 7 +++++++ kernel/rcu/srcutiny.c | 1 + kernel/rcu/srcutree.c | 1 + kernel/rcu/tasks.h | 1 + kernel/rcu/tiny.c | 1 + kernel/rcu/tree.c | 1 + 6 files changed, 12 insertions(+) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 98e13be411af..d612731feea4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -10,6 +10,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include #include /* @@ -248,6 +249,12 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_callback(struct rcu_head *rhp) +{ + if (unlikely(!rhp->func)) + kmem_dump_obj(rhp); +} + extern int rcu_cpu_stall_suppress_at_boot; static inline bool rcu_stall_is_suppressed_at_boot(void) diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 336af24e0fe3..c38e5933a5d6 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -138,6 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) while (lh) { rhp = lh; lh = lh->next; + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index f1a905200fc2..833a8f848a90 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1710,6 +1710,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8d65f7d576a3..7c845532a50a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -538,6 +538,7 @@ static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); len = rcl.len; for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_callback(rhp); local_bh_disable(); rhp->func(rhp); local_bh_enable(); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 42f7589e51e0..fec804b79080 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -97,6 +97,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) trace_rcu_invoke_callback("", head); f = head->func; + debug_rcu_head_callback(head); WRITE_ONCE(head->func, (rcu_callback_t)0L); f(head); rcu_lock_release(&rcu_callback_map); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3c7281fc25a7..aae515071ffd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2135,6 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_invoke_callback(rcu_state.name, rhp); f = rhp->func; + debug_rcu_head_callback(rhp); WRITE_ONCE(rhp->func, (rcu_callback_t)0L); f(rhp); From 0ae9942f03d0d034fdb0a4f44fc99f62a3107987 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Aug 2023 08:53:58 -0700 Subject: [PATCH 22/33] rcu: Eliminate rcu_gp_slow_unregister() false positive When using rcutorture as a module, there are a number of conditions that can abort the modprobe operation, for example, when attempting to run both RCU CPU stall warning tests and forward-progress tests. This can cause rcu_torture_cleanup() to be invoked on the unwind path out of rcu_rcu_torture_init(), which will mean that rcu_gp_slow_unregister() is invoked without a matching rcu_gp_slow_register(). This will cause a splat because rcu_gp_slow_unregister() is passed rcu_fwd_cb_nodelay, which does not match a NULL pointer. This commit therefore forgives a mismatch involving a NULL pointer, thus avoiding this false-positive splat. Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index aae515071ffd..a83ecab77917 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1260,7 +1260,7 @@ EXPORT_SYMBOL_GPL(rcu_gp_slow_register); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { - WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress); + WARN_ON_ONCE(rgssp && rgssp != rcu_gp_slow_suppress && rcu_gp_slow_suppress != NULL); WRITE_ONCE(rcu_gp_slow_suppress, NULL); } From d8d5b7bf6f2105883bbd91bbd4d5b67e4e3dff71 Mon Sep 17 00:00:00 2001 From: Denis Arefev Date: Mon, 4 Sep 2023 15:21:14 +0300 Subject: [PATCH 23/33] srcu: Fix srcu_struct node grpmask overflow on 64-bit systems The value of a bitwise expression 1 << (cpu - sdp->mynode->grplo) is subject to overflow due to a failure to cast operands to a larger data type before performing the bitwise operation. The maximum result of this subtraction is defined by the RCU_FANOUT_LEAF Kconfig option, which on 64-bit systems defaults to 16 (resulting in a maximum shift of 15), but which can be set up as high as 64 (resulting in a maximum shift of 63). A value of 31 can result in sign extension, resulting in 0xffffffff80000000 instead of the desired 0x80000000. A value of 32 or greater triggers undefined behavior per the C standard. This bug has not been known to cause issues because almost all kernels take the default CONFIG_RCU_FANOUT_LEAF=16. Furthermore, as long as a given compiler gives a deterministic non-zero result for 1<=32, the code correctly invokes all SRCU callbacks, albeit wasting CPU time along the way. This commit therefore substitutes the correct 1UL for the buggy 1. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Denis Arefev Reviewed-by: Mathieu Desnoyers Reviewed-by: Joel Fernandes (Google) Cc: David Laight Signed-off-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 833a8f848a90..5602042856b1 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -223,7 +223,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags) snp->grplo = cpu; snp->grphi = cpu; } - sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo); } smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER); return true; @@ -835,7 +835,7 @@ static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp int cpu; for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { - if (!(mask & (1 << (cpu - snp->grplo)))) + if (!(mask & (1UL << (cpu - snp->grplo)))) continue; srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay); } From 5f98fd034ca6fd1ab8c91a3488968a0e9caaabf6 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sat, 30 Sep 2023 17:46:56 +0000 Subject: [PATCH 24/33] rcu: kmemleak: Ignore kmemleak false positives when RCU-freeing objects Since the actual slab freeing is deferred when calling kvfree_rcu(), so is the kmemleak_free() callback informing kmemleak of the object deletion. From the perspective of the kvfree_rcu() caller, the object is freed and it may remove any references to it. Since kmemleak does not scan RCU internal data storing the pointer, it will report such objects as leaks during the grace period. Tell kmemleak to ignore such objects on the kvfree_call_rcu() path. Note that the tiny RCU implementation does not have such issue since the objects can be tracked from the rcu_ctrlblk structure. Signed-off-by: Catalin Marinas Reported-by: Christoph Paasch Closes: https://lore.kernel.org/all/F903A825-F05F-4B77-A2B5-7356282FBA2C@apple.com/ Cc: Tested-by: Christoph Paasch Reviewed-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a83ecab77917..4dd7df30df31 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -3389,6 +3390,14 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) success = true; } + /* + * The kvfree_rcu() caller considers the pointer freed at this point + * and likely removes any references to it. Since the actual slab + * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore + * this object (no scanning or false positives reporting). + */ + kmemleak_ignore(ptr); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) schedule_delayed_monitor_work(krcp); From 7df2a2a024145d0dcc1e51dc378c527000b35b07 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:54 +0200 Subject: [PATCH 25/33] rcu: Use rcu_segcblist_segempty() instead of open coding it This makes the code more readable. Reviewed-by: Qiuxu Zhuo Reviewed-by: Joel Fernandes (Google) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/rcu_segcblist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index f71fac422c8f..1693ea22ef1b 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -368,7 +368,7 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1]) + if (!rcu_segcblist_segempty(rsclp, i)) break; rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); @@ -551,7 +551,7 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * as their ->gp_seq[] grace-period completion sequence number. */ for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) - if (rsclp->tails[i] != rsclp->tails[i - 1] && + if (!rcu_segcblist_segempty(rsclp, i) && ULONG_CMP_LT(rsclp->gp_seq[i], seq)) break; From 358662a9616c5078dc4d389d6bceeb5974f4aa97 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:58 +0200 Subject: [PATCH 26/33] rcu: Assume IRQS disabled from rcu_report_dead() rcu_report_dead() is the last RCU word from the CPU down through the hotplug path. It is called in the idle loop right before the CPU shuts down for good. Because it removes the CPU from the grace period state machine and reports an ultimate quiescent state if necessary, no further use of RCU is allowed. Therefore it is expected that IRQs are disabled upon calling this function and are not to be re-enabled again until the CPU shuts down. Remove the IRQs disablement from that function and verify instead that it is actually called with IRQs disabled as it is expected at that special point in the idle path. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4dd7df30df31..8c2954502e55 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4562,11 +4562,16 @@ void rcu_cpu_starting(unsigned int cpu) */ void rcu_report_dead(unsigned int cpu) { - unsigned long flags, seq_flags; + unsigned long flags; unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + /* + * IRQS must be disabled from now on and until the CPU dies, or an interrupt + * may introduce a new READ-side while it is actually off the QS masks. + */ + lockdep_assert_irqs_disabled(); // Do any dangling deferred wakeups. do_nocb_deferred_wakeup(rdp); @@ -4574,7 +4579,6 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - local_irq_save(seq_flags); arch_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4588,8 +4592,6 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); arch_spin_unlock(&rcu_state.ofl_lock); - local_irq_restore(seq_flags); - rdp->cpu_started = false; } From c964c1f5ee96e1460606d44f80a47bdacd8fe568 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:35:59 +0200 Subject: [PATCH 27/33] rcu: Assume rcu_report_dead() is always called locally rcu_report_dead() has to be called locally by the CPU that is going to exit the RCU state machine. Passing a cpu argument here is error-prone and leaves the possibility for a racy remote call. Use local access instead. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- arch/arm64/kernel/smp.c | 2 +- include/linux/rcupdate.h | 2 +- kernel/cpu.c | 2 +- kernel/rcu/tree.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 960b98b43506..8fa646c90c67 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -401,7 +401,7 @@ void __noreturn cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); - rcu_report_dead(cpu); + rcu_report_dead(); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { update_cpu_boot_status(CPU_KILL_ME); diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 5e5f920ade90..aa351ddcbe8d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -122,7 +122,7 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -void rcu_report_dead(unsigned int cpu); +void rcu_report_dead(void); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC diff --git a/kernel/cpu.c b/kernel/cpu.c index 6de7c6bb74ee..076e75fed8bb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1388,7 +1388,7 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(smp_processor_id()); + rcu_report_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* * We cannot call complete after rcu_report_dead() so we delegate it diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8c2954502e55..2e1e7eadf2cc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4560,11 +4560,11 @@ void rcu_cpu_starting(unsigned int cpu) * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. */ -void rcu_report_dead(unsigned int cpu) +void rcu_report_dead(void) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ /* From a9930e85290ef0a3ebefc90b30638722db3a947c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:02 +0200 Subject: [PATCH 28/33] rcu: Remove references to rcu_migrate_callbacks() from diagrams This function is gone since: 53b46303da84 (rcu: Remove rsp parameter from rcu_boot_init_percpu_data() and friends) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../Design/Memory-Ordering/TreeRCU-callback-registry.svg | 9 --------- Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 9 --------- 2 files changed, 18 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg index 7ac6f9269806..63eff867175a 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-callback-registry.svg @@ -564,15 +564,6 @@ font-size="192" id="text202-7-9-6" style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_migrate_callbacks() - rcu_migrate_callbacks() rcutree_migrate_callbacks() - rcu_migrate_callbacks() Date: Fri, 8 Sep 2023 22:36:00 +0200 Subject: [PATCH 29/33] rcu: Conditionally build CPU-hotplug teardown callbacks Among the three CPU-hotplug teardown RCU callbacks, two of them early exit if CONFIG_HOTPLUG_CPU=n, and one is left unchanged. In any case all of them have an implementation when CONFIG_HOTPLUG_CPU=n. Align instead with the common way to deal with CPU-hotplug teardown callbacks and provide a proper stub when they are not supported. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rcutree.h | 13 +++-- kernel/rcu/tree.c | 114 +++++++++++++++++++--------------------- 2 files changed, 64 insertions(+), 63 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 153cfc7bbffd..46875c4e9f56 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -110,9 +110,16 @@ void rcu_all_qs(void); /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); -int rcutree_offline_cpu(unsigned int cpu); -int rcutree_dead_cpu(unsigned int cpu); -int rcutree_dying_cpu(unsigned int cpu); void rcu_cpu_starting(unsigned int cpu); +#ifdef CONFIG_HOTPLUG_CPU +int rcutree_dead_cpu(unsigned int cpu); +int rcutree_dying_cpu(unsigned int cpu); +int rcutree_offline_cpu(unsigned int cpu); +#else +#define rcutree_dead_cpu NULL +#define rcutree_dying_cpu NULL +#define rcutree_offline_cpu NULL +#endif + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2e1e7eadf2cc..f9c6b2680cbb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4237,25 +4237,6 @@ static bool rcu_init_invoked(void) return !!rcu_state.n_online_cpus; } -/* - * Near the end of the offline process. Trace the fact that this CPU - * is going offline. - */ -int rcutree_dying_cpu(unsigned int cpu) -{ - bool blkd; - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - struct rcu_node *rnp = rdp->mynode; - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); - trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), - blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); - return 0; -} - /* * All CPUs for the specified rcu_node structure have gone offline, * and all tasks that were preempted within an RCU read-side critical @@ -4301,23 +4282,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } -/* - * The CPU has been completely removed, and some other CPU is reporting - * this fact from process context. Do the remainder of the cleanup. - * There can only be one CPU hotplug operation at a time, so no need for - * explicit locking. - */ -int rcutree_dead_cpu(unsigned int cpu) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return 0; - - WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); - // Stop-machine done, so allow nohz_full to disable tick. - tick_dep_clear(TICK_DEP_BIT_RCU); - return 0; -} - /* * Propagate ->qsinitmask bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller @@ -4470,29 +4434,6 @@ int rcutree_online_cpu(unsigned int cpu) return 0; } -/* - * Near the beginning of the process. The CPU is still very much alive - * with pretty much all services enabled. - */ -int rcutree_offline_cpu(unsigned int cpu) -{ - unsigned long flags; - struct rcu_data *rdp; - struct rcu_node *rnp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - rnp->ffmask &= ~rdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - rcutree_affinity_setting(cpu, cpu); - - // nohz_full CPUs need the tick for stop-machine to work quickly - tick_dep_set(TICK_DEP_BIT_RCU); - return 0; -} - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -4646,7 +4587,60 @@ void rcutree_migrate_callbacks(int cpu) cpu, rcu_segcblist_n_cbs(&rdp->cblist), rcu_segcblist_first_cb(&rdp->cblist)); } -#endif + +/* + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no need for + * explicit locking. + */ +int rcutree_dead_cpu(unsigned int cpu) +{ + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); + // Stop-machine done, so allow nohz_full to disable tick. + tick_dep_clear(TICK_DEP_BIT_RCU); + return 0; +} + +/* + * Near the end of the offline process. Trace the fact that this CPU + * is going offline. + */ +int rcutree_dying_cpu(unsigned int cpu) +{ + bool blkd; + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + struct rcu_node *rnp = rdp->mynode; + + blkd = !!(READ_ONCE(rnp->qsmask) & rdp->grpmask); + trace_rcu_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq), + blkd ? TPS("cpuofl-bgp") : TPS("cpuofl")); + return 0; +} + +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ +int rcutree_offline_cpu(unsigned int cpu) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + rnp->ffmask &= ~rdp->grpmask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + rcutree_affinity_setting(cpu, cpu); + + // nohz_full CPUs need the tick for stop-machine to work quickly + tick_dep_set(TICK_DEP_BIT_RCU); + return 0; +} +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend From 448e9f34d91d1a4799fdb06a93c2c24b34b6fd9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:01 +0200 Subject: [PATCH 30/33] rcu: Standardize explicit CPU-hotplug calls rcu_report_dead() and rcutree_migrate_callbacks() have their headers in rcupdate.h while those are pure rcutree calls, like the other CPU-hotplug functions. Also rcu_cpu_starting() and rcu_report_dead() have different naming conventions while they mirror each other's effects. Fix the headers and propose a naming that relates both functions and aligns with the prefix of other rcutree CPU-hotplug functions. Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- .../Expedited-Grace-Periods.rst | 2 +- .../RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg | 4 ++-- .../RCU/Design/Memory-Ordering/TreeRCU-gp.svg | 4 ++-- .../RCU/Design/Memory-Ordering/TreeRCU-hotplug.svg | 4 ++-- .../RCU/Design/Requirements/Requirements.rst | 4 ++-- arch/arm64/kernel/smp.c | 4 ++-- arch/powerpc/kernel/smp.c | 2 +- arch/s390/kernel/smp.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- include/linux/interrupt.h | 2 +- include/linux/rcupdate.h | 2 -- include/linux/rcutiny.h | 2 +- include/linux/rcutree.h | 7 ++++++- kernel/cpu.c | 6 +++--- kernel/rcu/tree.c | 12 ++++++++---- 15 files changed, 33 insertions(+), 26 deletions(-) diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index 93d899d53258..414f8a2012d6 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -181,7 +181,7 @@ operations is carried out at several levels: of this wait (or series of waits, as the case may be) is to permit a concurrent CPU-hotplug operation to complete. #. In the case of RCU-sched, one of the last acts of an outgoing CPU is - to invoke ``rcu_report_dead()``, which reports a quiescent state for + to invoke ``rcutree_report_cpu_dead()``, which reports a quiescent state for that CPU. However, this is likely paranoia-induced redundancy. +-----------------------------------------------------------------------+ diff --git a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg index 7ddc094d7f28..d82a77d03d8c 100644 --- a/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg +++ b/Documentation/RCU/Design/Memory-Ordering/TreeRCU-gp-fqs.svg @@ -1135,7 +1135,7 @@ font-weight="bold" font-size="192" id="text202-7-5-3-27-6-5" - style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() rcu_report_dead() + style="font-size:192px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;stroke-width:0.025in;font-family:Courier">rcutree_report_cpu_dead() rcu_cpu_starting() + xml:space="preserve">rcutree_report_cpu_starting() setup_cpu) diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index a4edb7ea66ea..214a1b67f80a 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -898,7 +898,7 @@ static void smp_start_secondary(void *cpuvoid) S390_lowcore.restart_flags = 0; restore_access_regs(S390_lowcore.access_regs_save_area); cpu_init(); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); init_cpu_timer(); vtime_init(); vdso_getcpu_init(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4e45ff44aa07..4ccb76f89af8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -288,7 +288,7 @@ static void notrace start_secondary(void *unused) cpu_init(); fpu__init_cpu(); - rcu_cpu_starting(raw_smp_processor_id()); + rcutree_report_cpu_starting(raw_smp_processor_id()); x86_cpuinit.early_percpu_clock_init(); ap_starting(); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a92bce40b04b..d05e1e9a553c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -566,7 +566,7 @@ enum * * _ RCU: * 1) rcutree_migrate_callbacks() migrates the queue. - * 2) rcu_report_dead() reports the final quiescent states. + * 2) rcutree_report_cpu_dead() reports the final quiescent states. * * _ IRQ_POLL: irq_poll_cpu_dead() migrates the queue */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index aa351ddcbe8d..f7206b2623c9 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -122,8 +122,6 @@ static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); -void rcu_report_dead(void); -void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7b949292908a..d9ac7b136aea 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -171,6 +171,6 @@ static inline void rcu_all_qs(void) { barrier(); } #define rcutree_offline_cpu NULL #define rcutree_dead_cpu NULL #define rcutree_dying_cpu NULL -static inline void rcu_cpu_starting(unsigned int cpu) { } +static inline void rcutree_report_cpu_starting(unsigned int cpu) { } #endif /* __LINUX_RCUTINY_H */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 46875c4e9f56..254244202ea9 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -110,7 +110,7 @@ void rcu_all_qs(void); /* RCUtree hotplug events */ int rcutree_prepare_cpu(unsigned int cpu); int rcutree_online_cpu(unsigned int cpu); -void rcu_cpu_starting(unsigned int cpu); +void rcutree_report_cpu_starting(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int rcutree_dead_cpu(unsigned int cpu); @@ -122,4 +122,9 @@ int rcutree_offline_cpu(unsigned int cpu); #define rcutree_offline_cpu NULL #endif +void rcutree_migrate_callbacks(int cpu); + +/* Called from hotplug and also arm64 early secondary boot failure */ +void rcutree_report_cpu_dead(void); + #endif /* __LINUX_RCUTREE_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 076e75fed8bb..2491766e1fd5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1388,10 +1388,10 @@ void cpuhp_report_idle_dead(void) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); BUG_ON(st->state != CPUHP_AP_OFFLINE); - rcu_report_dead(); + rcutree_report_cpu_dead(); st->state = CPUHP_AP_IDLE_DEAD; /* - * We cannot call complete after rcu_report_dead() so we delegate it + * We cannot call complete after rcutree_report_cpu_dead() so we delegate it * to an online cpu. */ smp_call_function_single(cpumask_first(cpu_online_mask), @@ -1617,7 +1617,7 @@ void notify_cpu_starting(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ + rcutree_report_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c6b2680cbb..36d8818eaec1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4216,7 +4216,7 @@ bool rcu_lockdep_current_cpu_online(void) rdp = this_cpu_ptr(&rcu_data); /* * Strictly, we care here about the case where the current CPU is - * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask + * in rcutree_report_cpu_starting() and thus has an excuse for rdp->grpmask * not being up to date. So arch_spin_is_locked() might have a * false positive if it's held by some *other* CPU, but that's * OK because that just means a false *negative* on the warning. @@ -4445,8 +4445,10 @@ int rcutree_online_cpu(unsigned int cpu) * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * This incoming CPU must not have enabled interrupts yet. + * + * This mirrors the effects of rcutree_report_cpu_dead(). */ -void rcu_cpu_starting(unsigned int cpu) +void rcutree_report_cpu_starting(unsigned int cpu) { unsigned long mask; struct rcu_data *rdp; @@ -4500,8 +4502,10 @@ void rcu_cpu_starting(unsigned int cpu) * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. + * + * This mirrors the effect of rcutree_report_cpu_starting(). */ -void rcu_report_dead(void) +void rcutree_report_cpu_dead(void) { unsigned long flags; unsigned long mask; @@ -5072,7 +5076,7 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); WARN_ON(num_online_cpus() > 1); // Only one CPU this early in boot. rcutree_prepare_cpu(cpu); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); rcutree_online_cpu(cpu); /* Create workqueue for Tree SRCU and for expedited GPs. */ From a28ab03b499601c14b8f502d875c2bee23209659 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 8 Sep 2023 22:36:03 +0200 Subject: [PATCH 31/33] rcu: Comment why callbacks migration can't wait for CPUHP_RCUTREE_PREP The callbacks migration is performed through an explicit call from the hotplug control CPU right after the death of the target CPU and before proceeding with the CPUHP_ teardown functions. This is unusual but necessary and yet uncommented. Summarize the reason as explained in the changelog of: a58163d8ca2c (rcu: Migrate callbacks earlier in the CPU-offline timeline) Reviewed-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/cpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 2491766e1fd5..3b9d5c7eb4a2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1372,7 +1372,14 @@ static int takedown_cpu(unsigned int cpu) cpuhp_bp_sync_dead(cpu); tick_cleanup_dead_cpu(cpu); + + /* + * Callbacks must be re-integrated right away to the RCU state machine. + * Otherwise an RCU callback could block a further teardown function + * waiting for its completion. + */ rcutree_migrate_callbacks(cpu); + return 0; } From 4a8e65b0c348e42107c64381e692e282900be361 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:28:59 +0200 Subject: [PATCH 32/33] srcu: Fix callbacks acceleration mishandling SRCU callbacks acceleration might fail if the preceding callbacks advance also fails. This can happen when the following steps are met: 1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12). 2) The grace period for RCU_WAIT_TAIL is observed as started but not yet completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5. 3) This value is passed to rcu_segcblist_advance() which can't move any segment forward and fails. 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. But then the call to rcu_seq_snap() observes the grace period for the RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one for the RCU_NEXT_READY_TAIL segment as started (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the next grace period, which is 16. 5) The value of 16 is passed to rcu_segcblist_accelerate() but the freshly enqueued callback in RCU_NEXT_TAIL can't move to RCU_NEXT_READY_TAIL which already has callbacks for a previous grace period (gp_num = 12). So acceleration fails. 6) Note in all these steps, srcu_invoke_callbacks() hadn't had a chance to run srcu_invoke_callbacks(). Then some very bad outcome may happen if the following happens: 7) Some other CPU races and starts the grace period number 16 before the CPU handling previous steps had a chance. Therefore srcu_gp_start() isn't called on the latter sdp to fix the acceleration leak from previous steps with a new pair of call to advance/accelerate. 8) The grace period 16 completes and srcu_invoke_callbacks() is finally called. All the callbacks from previous grace periods (8 and 12) are correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL still remain. Then rcu_segcblist_accelerate() is called with a snaphot of 20. 9) Since nothing started the grace period number 20, callbacks stay unhandled. This has been reported in real load: [3144162.608392] INFO: task kworker/136:12:252684 blocked for more than 122 seconds. [3144162.615986] Tainted: G O K 5.4.203-1-tlinux4-0011.1 #1 [3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [3144162.631162] kworker/136:12 D 0 252684 2 0x90004000 [3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] [3144162.631192] Call Trace: [3144162.631202] __schedule+0x2ee/0x660 [3144162.631206] schedule+0x33/0xa0 [3144162.631209] schedule_timeout+0x1c4/0x340 [3144162.631214] ? update_load_avg+0x82/0x660 [3144162.631217] ? raw_spin_rq_lock_nested+0x1f/0x30 [3144162.631218] wait_for_completion+0x119/0x180 [3144162.631220] ? wake_up_q+0x80/0x80 [3144162.631224] __synchronize_srcu.part.19+0x81/0xb0 [3144162.631226] ? __bpf_trace_rcu_utilization+0x10/0x10 [3144162.631227] synchronize_srcu+0x5f/0xc0 [3144162.631236] irqfd_shutdown+0x3c/0xb0 [kvm] [3144162.631239] ? __schedule+0x2f6/0x660 [3144162.631243] process_one_work+0x19a/0x3a0 [3144162.631244] worker_thread+0x37/0x3a0 [3144162.631247] kthread+0x117/0x140 [3144162.631247] ? process_one_work+0x3a0/0x3a0 [3144162.631248] ? __kthread_cancel_work+0x40/0x40 [3144162.631250] ret_from_fork+0x1f/0x30 Fix this with taking the snapshot for acceleration _before_ the read of the current grace period number. The only side effect of this solution is that callbacks advancing happen then _after_ the full barrier in rcu_seq_snap(). This is not a problem because that barrier only cares about: 1) Ordering accesses of the update side before call_srcu() so they don't bleed. 2) See all the accesses prior to the grace period of the current gp_num The only things callbacks advancing need to be ordered against are carried by snp locking. Reported-by: Yong He Co-developed-by:: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Neeraj upadhyay Signed-off-by: Neeraj upadhyay Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com Fixes: da915ad5cf25 ("srcu: Parallelize callback handling") Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5602042856b1..9fab9ac36996 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1244,10 +1244,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, spin_lock_irqsave_sdp_contention(sdp, &flags); if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + /* + * The snapshot for acceleration must be taken _before_ the read of the + * current gp sequence used for advancing, otherwise advancing may fail + * and acceleration may then fail too. + * + * This could happen if: + * + * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the + * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). + * + * 2) The grace period for RCU_WAIT_TAIL is seen as started but not + * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. + * + * 3) This value is passed to rcu_segcblist_advance() which can't move + * any segment forward and fails. + * + * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + * But then the call to rcu_seq_snap() observes the grace period for the + * RCU_WAIT_TAIL segment as completed and the subsequent one for the + * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) + * so it returns a snapshot of the next grace period, which is X + 12. + * + * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the + * freshly enqueued callback in RCU_NEXT_TAIL can't move to + * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + * period (gp_num = X + 8). So acceleration fails. + */ + s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { sdp->srcu_gp_seq_needed = s; needgp = true; From 8a77f38bcd28d3c22ab7dd8eff3f299d43c00411 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Oct 2023 01:29:00 +0200 Subject: [PATCH 33/33] srcu: Only accelerate on enqueue time Acceleration in SRCU happens on enqueue time for each new callback. This operation is expected not to fail and therefore any similar attempt from other places shouldn't find any remaining callbacks to accelerate. Moreover accelerations performed beyond enqueue time are error prone because rcu_seq_snap() then may return the snapshot for a new grace period that is not going to be started. Remove these dangerous and needless accelerations and introduce instead assertions reporting leaking unaccelerated callbacks beyond enqueue time. Co-developed-by: Yong He Signed-off-by: Yong He Co-developed-by: Joel Fernandes (Google) Signed-off-by: Joel Fernandes (Google) Co-developed-by: Neeraj upadhyay Signed-off-by: Neeraj upadhyay Reviewed-by: Like Xu Signed-off-by: Frederic Weisbecker --- kernel/rcu/srcutree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9fab9ac36996..560e99ec5333 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -784,8 +784,7 @@ static void srcu_gp_start(struct srcu_struct *ssp) spin_lock_rcu_node(sdp); /* Interrupts already disabled. */ rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies); WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0); @@ -1721,6 +1720,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) ssp = sdp->ssp; rcu_cblist_init(&ready_cbs); spin_lock_irq_rcu_node(sdp); + WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL)); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || @@ -1750,8 +1750,6 @@ static void srcu_invoke_callbacks(struct work_struct *work) */ spin_lock_irq_rcu_node(sdp); rcu_segcblist_add_len(&sdp->srcu_cblist, -len); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, - rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); spin_unlock_irq_rcu_node(sdp);