From 11e46f0804c406ff5cd67f5ed8387ee8a2f30b8a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Jun 2021 11:14:19 -0700 Subject: [PATCH 01/46] torture: Apply CONFIG_KCSAN_STRICT to kvm.sh --kcsan argument Currently, the --kcsan argument to kvm.sh applies a laundry list of Kconfig options. Now that KCSAN provides the CONFIG_KCSAN_STRICT Kconfig option, this commit reduces the laundry list to this one option. Signed-off-by: Paul E. McKenney --- tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index f442d84fb2a3..6cf9ec6a3d1c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -184,7 +184,7 @@ do TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG ;; --kcsan) - TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_INTERRUPT_WATCHER=y CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG + TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_STRICT=y CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_DEBUG_LOCK_ALLOC=y CONFIG_PROVE_LOCKING=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' From 2431774f04d1050292054c763070021bade7b151 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jul 2021 06:16:27 -0700 Subject: [PATCH 02/46] rcu: Mark accesses to rcu_state.n_force_qs This commit marks accesses to the rcu_state.n_force_qs. These data races are hard to make happen, but syzkaller was equal to the task. Reported-by: syzbot+e08a83a1940ec3846cd5@syzkaller.appspotmail.com Acked-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bce848e50512..c89f5e6c4154 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1907,7 +1907,7 @@ static void rcu_gp_fqs(bool first_time) struct rcu_node *rnp = rcu_get_root(); WRITE_ONCE(rcu_state.gp_activity, jiffies); - rcu_state.n_force_qs++; + WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1); if (first_time) { /* Collect dyntick-idle snapshots. */ force_qs_rnp(dyntick_save_progress_counter); @@ -2550,7 +2550,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; @@ -2898,10 +2898,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, } else { /* Give the grace period a kick. 
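That is, raise the callback limit and, unless the just-queued callback is the first one pending, force a quiescent state if none has been forced since the last snapshot.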
*/ rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; - if (rcu_state.n_force_qs == rdp->n_force_qs_snap && + if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap && rcu_segcblist_first_pend_cb(&rdp->cblist) != head) rcu_force_quiescent_state(); - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } @@ -4128,7 +4128,7 @@ int rcutree_prepare_cpu(unsigned int cpu) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; - rdp->n_force_qs_snap = rcu_state.n_force_qs; + rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); From 52b030aa278642194f5d25872c33360013b0167e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jul 2021 13:24:35 -0700 Subject: [PATCH 03/46] rcu-nocb: Fix a couple of tree_nocb code-style nits This commit removes a non-value-returning "return" statement at the end of __call_rcu_nocb_wake() and adds a blank line following declarations in nocb_cb_can_run(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 8fdf44f8523f..368ef7b9af4f 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -549,7 +549,6 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock_irqrestore(rdp, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } - return; } /* @@ -767,6 +766,7 @@ static int rcu_nocb_gp_kthread(void *arg) static inline bool nocb_cb_can_run(struct rcu_data *rdp) { u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); } From 88ee23ef1c129e40309f4612f80dd74be4590c03 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jul 2021 15:49:05 -0700 Subject: [PATCH 04/46] rcu: Eliminate rcu_implicit_dynticks_qs() local variable rnhqp The rcu_implicit_dynticks_qs() function's local variable rnhqp references the ->rcu_need_heavy_qs field in the rcu_data structure referenced by the function parameter rdp, with a rather odd method for computing the pointer to this field. This commit therefore simplifies things and saves a few lines of code by replacing each instance of rnhqp with &rdp->rcu_need_heavy_qs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c89f5e6c4154..18d2f35d1450 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1219,7 +1219,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; - bool *rnhqp; bool *ruqp; struct rcu_node *rnp = rdp->mynode; @@ -1286,12 +1285,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) */ jtsq = READ_ONCE(jiffies_to_sched_qs); ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); - rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu); - if (!READ_ONCE(*rnhqp) && + if (!READ_ONCE(rdp->rcu_need_heavy_qs) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || rcu_state.cbovld)) { - WRITE_ONCE(*rnhqp, true); + WRITE_ONCE(rdp->rcu_need_heavy_qs, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. 
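The release semantics of the smp_store_release() below supply this ordering.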
*/ smp_store_release(ruqp, true); } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) { From 9424b867a759febc2b67b6777bfa27f0f830d437 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jul 2021 16:47:42 -0700 Subject: [PATCH 05/46] rcu: Eliminate rcu_implicit_dynticks_qs() local variable ruqp The rcu_implicit_dynticks_qs() function's local variable ruqp references the ->rcu_urgent_qs field in the rcu_data structure referenced by the function parameter rdp, with a rather odd method for computing the pointer to this field. This commit therefore simplifies things and saves a couple of lines of code by replacing each instance of ruqp with &rdp->rcu_urgent_qs. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 18d2f35d1450..0bfebec94277 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1219,7 +1219,6 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { unsigned long jtsq; - bool *ruqp; struct rcu_node *rnp = rdp->mynode; /* @@ -1284,16 +1283,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * is set way high. */ jtsq = READ_ONCE(jiffies_to_sched_qs); - ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu); if (!READ_ONCE(rdp->rcu_need_heavy_qs) && (time_after(jiffies, rcu_state.gp_start + jtsq * 2) || time_after(jiffies, rcu_state.jiffies_resched) || rcu_state.cbovld)) { WRITE_ONCE(rdp->rcu_need_heavy_qs, true); /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ - smp_store_release(ruqp, true); + smp_store_release(&rdp->rcu_urgent_qs, true); } else if (time_after(jiffies, rcu_state.gp_start + jtsq)) { - WRITE_ONCE(*ruqp, true); + WRITE_ONCE(rdp->rcu_urgent_qs, true); } /* @@ -1307,7 +1305,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (tick_nohz_full_cpu(rdp->cpu) && (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) || rcu_state.cbovld)) { - WRITE_ONCE(*ruqp, true); + WRITE_ONCE(rdp->rcu_urgent_qs, true); resched_cpu(rdp->cpu); WRITE_ONCE(rdp->last_fqs_resched, jiffies); } From 13bc8fa8057a064007d4e69c12799450123ef731 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jul 2021 21:41:48 -0700 Subject: [PATCH 06/46] doc: Add another stall-warning root cause in stallwarn.rst This commit adds a bullet item noting that both deficiencies and surpluses of calls to rcu_*_enter() and rcu_*_exit() can result in RCU CPU stall warnings. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index 5036df24ae61..28f8ad16db25 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -96,6 +96,16 @@ warnings: the ``rcu_.*timer wakeup didn't happen for`` console-log message, which will include additional debugging information. +- A low-level kernel issue that either fails to invoke one of the + variants of rcu_user_enter(), rcu_user_exit(), rcu_idle_enter(), + rcu_idle_exit(), rcu_irq_enter(), or rcu_irq_exit() on the one + hand, or that invokes one of them too many times on the other. + Historically, the most frequent issue has been an omission + of either irq_enter() or irq_exit(), which in turn invoke + rcu_irq_enter() or rcu_irq_exit(), respectively. 
Building your + kernel with CONFIG_RCU_EQS_DEBUG=y can help track down these types + of issues, which sometimes arise in architecture-specific code. + - A bug in the RCU implementation. - A hardware failure. This is quite unlikely, but has occurred From 3ac858785231e2573d2964950738aac087ebf30c Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Mon, 26 Jul 2021 05:43:33 +0800 Subject: [PATCH 07/46] rcu: Fix undefined Kconfig macros Invoking scripts/checkkconfigsymbols.py in the Linux-kernel source tree located the following issues: 1. TREE_PREEMPT_RCU Referencing files: arch/sh/configs/sdk7786_defconfig It should now be CONFIG_PREEMPT_RCU. Except that the CONFIG_PREEMPT=y in that same file implies CONFIG_PREEMPT_RCU=y. Therefore, delete the CONFIG_TREE_PREEMPT_RCU=y line. The reason is as follows: In kernel/rcu/Kconfig, we have config PREEMPT_RCU bool default y if PREEMPTION https://www.kernel.org/doc/Documentation/kbuild/kconfig-language.txt says, "The default value is only assigned to the config symbol if no other value was set by the user (via the input prompt above)." Because there is no prompt in the config PREEMPT_RCU entry, we are guaranteed to get CONFIG_PREEMPT_RCU=y when CONFIG_PREEMPT is present. 2. RCU_CPU_STALL_INFO Referencing files: arch/xtensa/configs/nommu_kc705_defconfig The old Kconfig option RCU_CPU_STALL_INFO was removed by commit 75c27f119b64 ("rcu: Remove CONFIG_RCU_CPU_STALL_INFO"), and the kernel now acts as if this Kconfig option was unconditionally enabled. 3. RCU_NOCB_CPU_ALL Referencing files: Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst This is an old snapshot of the code. I updated it from the real rcu_prepare_for_idle() function in kernel/rcu/tree_plugin.h. This change was tested by invoking "make htmldocs". 4. RCU_TORTURE_TESTS Referencing files: kernel/rcu/rcutorture.c Forward-progress checking conflicts with CPU-stall testing, so we should complain at "modprobe rcutorture" time when both are enabled. The check misspelled CONFIG_RCU_TORTURE_TEST as CONFIG_RCU_TORTURE_TESTS. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- .../Tree-RCU-Memory-Ordering.rst | 69 +++++++++---------- arch/sh/configs/sdk7786_defconfig | 1 - arch/xtensa/configs/nommu_kc705_defconfig | 1 - kernel/rcu/rcutorture.c | 2 +- 4 files changed, 33 insertions(+), 40 deletions(-) diff --git a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst index eeb351296df1..7fdf151a8680 100644 --- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst +++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst @@ -202,49 +202,44 @@ newly arrived RCU callbacks against future grace periods: 1 static void rcu_prepare_for_idle(void) 2 { 3 bool needwake; - 4 struct rcu_data *rdp; - 5 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - 6 struct rcu_node *rnp; - 7 struct rcu_state *rsp; - 8 int tne; - 9 - 10 if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || - 11 rcu_is_nocb_cpu(smp_processor_id())) - 12 return; + 4 struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + 5 struct rcu_node *rnp; + 6 int tne; + 7 + 8 lockdep_assert_irqs_disabled(); + 9 if (rcu_rdp_is_offloaded(rdp)) + 10 return; + 11 + 12 /* Handle nohz enablement switches conservatively. 
*/ 13 tne = READ_ONCE(tick_nohz_active); - 14 if (tne != rdtp->tick_nohz_enabled_snap) { - 15 if (rcu_cpu_has_callbacks(NULL)) - 16 invoke_rcu_core(); - 17 rdtp->tick_nohz_enabled_snap = tne; + 14 if (tne != rdp->tick_nohz_enabled_snap) { + 15 if (!rcu_segcblist_empty(&rdp->cblist)) + 16 invoke_rcu_core(); /* force nohz to see update. */ + 17 rdp->tick_nohz_enabled_snap = tne; 18 return; - 19 } + 19 } 20 if (!tne) 21 return; - 22 if (rdtp->all_lazy && - 23 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { - 24 rdtp->all_lazy = false; - 25 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - 26 invoke_rcu_core(); - 27 return; - 28 } - 29 if (rdtp->last_accelerate == jiffies) - 30 return; - 31 rdtp->last_accelerate = jiffies; - 32 for_each_rcu_flavor(rsp) { - 33 rdp = this_cpu_ptr(rsp->rda); - 34 if (rcu_segcblist_pend_cbs(&rdp->cblist)) - 35 continue; - 36 rnp = rdp->mynode; - 37 raw_spin_lock_rcu_node(rnp); - 38 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - 39 raw_spin_unlock_rcu_node(rnp); - 40 if (needwake) - 41 rcu_gp_kthread_wake(rsp); - 42 } - 43 } + 22 + 23 /* + 24 * If we have not yet accelerated this jiffy, accelerate all + 25 * callbacks on this CPU. + 26 */ + 27 if (rdp->last_accelerate == jiffies) + 28 return; + 29 rdp->last_accelerate = jiffies; + 30 if (rcu_segcblist_pend_cbs(&rdp->cblist)) { + 31 rnp = rdp->mynode; + 32 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + 33 needwake = rcu_accelerate_cbs(rnp, rdp); + 34 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + 35 if (needwake) + 36 rcu_gp_kthread_wake(); + 37 } + 38 } But the only part of ``rcu_prepare_for_idle()`` that really matters for -this discussion are lines 37–39. We will therefore abbreviate this +this discussion are lines 32–34. We will therefore abbreviate this function as follows: .. kernel-figure:: rcu_node-lock.svg diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index f776a1d0d277..a8662b6927ec 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -5,7 +5,6 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_AUDIT=y CONFIG_AUDITSYSCALL=y -CONFIG_TREE_PREEMPT_RCU=y CONFIG_RCU_TRACE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y diff --git a/arch/xtensa/configs/nommu_kc705_defconfig b/arch/xtensa/configs/nommu_kc705_defconfig index 88b2e222d4bf..fcb620ef3799 100644 --- a/arch/xtensa/configs/nommu_kc705_defconfig +++ b/arch/xtensa/configs/nommu_kc705_defconfig @@ -119,7 +119,6 @@ CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_STACKTRACE=y -# CONFIG_RCU_CPU_STALL_INFO is not set CONFIG_RCU_TRACE=y # CONFIG_FTRACE is not set # CONFIG_LD_NO_RELAX is not set diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ab4215266ebe..f640cbd9262c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2449,7 +2449,7 @@ static int __init rcu_torture_fwd_prog_init(void) } if (stall_cpu > 0) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing"); - if (IS_MODULE(CONFIG_RCU_TORTURE_TESTS)) + if (IS_MODULE(CONFIG_RCU_TORTURE_TEST)) return -EINVAL; /* In module, can fail back to user. */ WARN_ON(1); /* Make sure rcutorture notices conflict. */ return 0; From ebc88ad491362e6a4fae5bfb1c23c06c876f70be Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 26 Jul 2021 11:57:39 -0700 Subject: [PATCH 08/46] rcu: Comment rcu_gp_init() code waiting for CPU-hotplug operations Near the beginning of rcu_gp_init() is a per-rcu_node loop that waits for CPU-hotplug operations that might have started before the new grace period did. This commit adds a comment explaining that this wait does not exclude CPU-hotplug operations. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0bfebec94277..e6e1b9281530 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,8 @@ static noinline_for_stack bool rcu_gp_init(void) */ WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); rcu_for_each_leaf_node(rnp) { + // Wait for CPU-hotplug operations that might have + // started before this grace period did. smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. firstseq = READ_ONCE(rnp->ofl_seq); if (firstseq & 0x1) From 2caebefb00f03b5ba13d44aa6cc3723759b43822 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jul 2021 12:38:42 -0700 Subject: [PATCH 09/46] rcu: Move rcu_dynticks_eqs_online() to rcu_cpu_starting() The purpose of rcu_dynticks_eqs_online() is to adjust the ->dynticks counter of an incoming CPU when required. It is currently invoked from rcutree_prepare_cpu(), which runs before the incoming CPU is running, and thus on some other CPU. This makes the per-CPU accesses in rcu_dynticks_eqs_online() iffy at best, and it all "works" only because the running CPU cannot possibly be in dyntick-idle mode, which means that rcu_dynticks_eqs_online() never has any effect. It is currently OK for rcu_dynticks_eqs_online() to have no effect, but only because the CPU-offline process just happens to leave ->dynticks in the correct state. After all, if ->dynticks were in the wrong state on a just-onlined CPU, rcutorture would complain bitterly the next time that CPU went idle, at least in kernels built with CONFIG_RCU_EQS_DEBUG=y, for example, those built by rcutorture scenario TREE04. One could argue that this means that rcu_dynticks_eqs_online() is unnecessary, however, removing it would make the CPU-online process vulnerable to slight changes in the CPU-offline process. One could also ask why it is safe to move the rcu_dynticks_eqs_online() call so late in the CPU-online process. Indeed, there was a time when it would not have been safe, which does much to explain its current location. However, the marking of a CPU as online from an RCU perspective has long since moved from rcutree_prepare_cpu() to rcu_cpu_starting(), and all that is required is that ->dynticks be set correctly by the time that the CPU is marked as online from an RCU perspective. After all, the RCU grace-period kthread does not check to see if offline CPUs are also idle. (In case you were curious, this is one reason why there is quiescent-state reporting as part of the offlining process.) This commit therefore moves the call to rcu_dynticks_eqs_online() from rcutree_prepare_cpu() to rcu_cpu_starting(), this latter being guaranteed to be running on the incoming CPU. The call to this function must of course be placed before this rcu_cpu_starting() announces this CPU's presence to RCU. Reported-by: Mathieu Desnoyers Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e6e1b9281530..801075e36515 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4129,7 +4129,6 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs); rdp->blimit = blimit; rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ - rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* @@ -4249,6 +4248,7 @@ void rcu_cpu_starting(unsigned int cpu) mask = rdp->grpmask; WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + rcu_dynticks_eqs_online(); smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); From 768f5d50e6ad88363291f96a2e230442b8d633bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jul 2021 15:35:21 -0700 Subject: [PATCH 10/46] rcu: Simplify rcu_report_dead() call to rcu_report_exp_rdp() Currently, rcu_report_dead() disables preemption across its call to rcu_report_exp_rdp(), but this is pointless because interrupts are already disabled by the caller. In addition, rcu_report_dead() computes the address of the outgoing CPU's rcu_data structure, which is also pointless because this address is already present in local variable rdp. This commit therefore drops the preemption disabling and passes rdp to rcu_report_exp_rdp(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 801075e36515..dc2968473593 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4294,9 +4294,7 @@ void rcu_report_dead(unsigned int cpu) do_nocb_deferred_wakeup(rdp); /* QS for any half-done expedited grace period. */ - preempt_disable(); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); - preempt_enable(); + rcu_report_exp_rdp(rdp); rcu_preempt_deferred_qs(current); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ From 4aa846f97c0c0d9740d120f9ac3e2fba1522ac0c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Jul 2021 20:30:32 -0700 Subject: [PATCH 11/46] rcu: Make rcutree_dying_cpu() use its "cpu" parameter The CPU-hotplug functions take a "cpu" parameter, but rcutree_dying_cpu() ignores it in favor of this_cpu_ptr(). This works at the moment, but it would be better to be consistent. This might also work better given some possible future changes. This commit therefore uses per_cpu_ptr() to avoid ignoring the rcutree_dying_cpu() function's argument. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dc2968473593..6a1e9d3374db 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2356,7 +2356,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) int rcutree_dying_cpu(unsigned int cpu) { bool blkd; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) From ebb6d30d9ed1fe7137486e1be2ae9d621e918c4a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 10 Aug 2021 10:48:15 +0200 Subject: [PATCH 12/46] rcu: Make rcu_normal_after_boot writable again Certain configurations (e.g., systems that make heavy use of netns) need to use synchronize_rcu_expedited() to service RCU grace periods even after boot. Even though synchronize_rcu_expedited() has traditionally been considered harmful for RT because of its heavy use of IPIs, it is perfectly usable under certain conditions (e.g., nohz_full). Make rcupdate.rcu_normal_after_boot= writable again on RT (if NO_HZ_FULL is defined), but keep its default value of 1 (enabled) to avoid regressions. Users who need synchronize_rcu_expedited() will boot with rcupdate.rcu_normal_after_boot=0 on the kernel command line. Reflect the change in synchronize_rcu_expedited_wait() by removing the WARN related to CONFIG_PREEMPT_RT. Signed-off-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 1 - kernel/rcu/update.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2796084ef85a..d9e4f8eb9ae2 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -512,7 +512,6 @@ static void synchronize_rcu_expedited_wait(void) j = READ_ONCE(jiffies_till_first_fqs); if (synchronize_rcu_expedited_wait_once(j + HZ)) return; - WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)); } for (;;) { diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c21b38cc25e9..bd551134e2f4 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -57,7 +57,7 @@ module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); -#ifndef CONFIG_PREEMPT_RT +#if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL) module_param(rcu_normal_after_boot, int, 0); #endif #endif /* #ifndef CONFIG_TINY_RCU */ From 1eac0075ebeecbf5c972f575ac448a0ea92e4f3a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 10 Aug 2021 10:48:16 +0200 Subject: [PATCH 13/46] rcu: Make rcu update module parameters world-readable The rcu update module parameters currently don't appear in sysfs, which is a serviceability issue, as it might be necessary to access their default values at runtime. Fix this issue by changing the rcu update module parameters' permissions to world-readable. Suggested-by: Paul E. McKenney Signed-off-by: Juri Lelli Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bd551134e2f4..94282dc12bab 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -54,11 +54,11 @@ #define MODULE_PARAM_PREFIX "rcupdate." 
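/* This prefix makes the parameters below appear as rcupdate.rcu_expedited and so on, both on the kernel command line and under /sys/module/rcupdate/parameters/. */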
#ifndef CONFIG_TINY_RCU -module_param(rcu_expedited, int, 0); -module_param(rcu_normal, int, 0); +module_param(rcu_expedited, int, 0444); +module_param(rcu_normal, int, 0444); static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); #if !defined(CONFIG_PREEMPT_RT) || defined(CONFIG_NO_HZ_FULL) -module_param(rcu_normal_after_boot, int, 0); +module_param(rcu_normal_after_boot, int, 0444); #endif #endif /* #ifndef CONFIG_TINY_RCU */ From f0b2b2df5423fb369ac762c77900bc7765496d58 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Wed, 18 Aug 2021 13:34:00 +0530 Subject: [PATCH 14/46] rcu: Fix existing exp request check in sync_sched_exp_online_cleanup() The sync_sched_exp_online_cleanup() function checks to see if RCU needs an expedited quiescent state from the incoming CPU, sending it an IPI if so. Before sending the IPI, it checks whether an expedited quiescent state has already been requested for the incoming CPU, but it does so by checking rcu_data.cpu_no_qs.b.exp for the current CPU, that is, the one on which sync_sched_exp_online_cleanup() is running. This works when the incoming CPU is the current CPU. However, when the incoming CPU is a different CPU, this check tests the wrong CPU's flag, which can potentially delay reporting of the expedited quiescent state for the incoming CPU. Fixes: e015a3411220 ("rcu: Avoid self-IPI in sync_sched_exp_online_cleanup()") Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d9e4f8eb9ae2..f3947c49eee7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -759,7 +759,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { + rdp->cpu_no_qs.b.exp) { put_cpu(); return; } From cbe0d8d91415c9692fe88191940d98952b6855d9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Jul 2021 12:17:59 -0700 Subject: [PATCH 15/46] rcu-tasks: Wait for trc_read_check_handler() IPIs Currently, RCU Tasks Trace initializes the trc_n_readers_need_end counter to the value one, increments it before each trc_read_check_handler() IPI, then decrements it within trc_read_check_handler() if the target task was in a quiescent state (or if the target task moved to some other CPU while the IPI was in flight), complaining if the new value was zero. The rationale for complaining is that the initial value of one must be decremented away before zero can be reached, and this decrement has not yet happened. Except that trc_read_check_handler() is initiated with an asynchronous smp_call_function_single(), which might be significantly delayed. This can result in false-positive complaints about the counter reaching zero. This commit therefore waits for in-flight IPI handlers to complete before decrementing away the initial value of one from the trc_n_readers_need_end counter. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 806160c44b17..3b2f8038064a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1150,14 +1150,28 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, } } +static void rcu_tasks_trace_empty_fn(void *unused) +{ +} + /* Wait for grace period to complete and provide ordering. 
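This includes waiting out any lingering trc_read_check_handler() IPIs, as set up below.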
*/ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) { + int cpu; bool firstreport; struct task_struct *g, *t; LIST_HEAD(holdouts); long ret; + // Wait for any lingering IPI handlers to complete. Note that + // if a CPU has gone offline or transitioned to userspace in the + // meantime, all IPI handlers should have been drained beforehand. + // Yes, this assumes that CPUs process IPIs in order. If that ever + // changes, there will need to be a recheck and/or timed wait. + for_each_online_cpu(cpu) + if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))) + smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); + // Remove the safety count. smp_mb__before_atomic(); // Order vs. earlier atomics atomic_dec(&trc_n_readers_need_end); From fda84866b1e68ab409074e7fcf1a7db800615445 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Aug 2021 17:42:25 -0700 Subject: [PATCH 16/46] rcutorture: Suppressing read-exit testing is not an error Currently, specifying the rcutorture.read_exit_burst=0 kernel boot parameter will result in a -EINVAL exit code that will stop the rcutorture test run before it has fully initialized. This commit therefore uses a zero exit code in that case, thus allowing rcutorture.read_exit_burst=0 to complete normally. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index ab4215266ebe..59254fa15cc6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2741,7 +2741,7 @@ static int rcu_torture_read_exit(void *unused) static int rcu_torture_read_exit_init(void) { if (read_exit_burst <= 0) - return -EINVAL; + return 0; init_waitqueue_head(&read_exit_wq); read_exit_child_stop = false; read_exit_child_stopped = false; From efeff6b39b9de4480572c7b0c5eb77204795cb57 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 13:28:24 -0700 Subject: [PATCH 17/46] rcutorture: Warn on individual rcu_torture_init() error conditions When running rcutorture as a module, any rcu_torture_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running rcutorture built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing rcu_torture_init() errors when running rcutorture built-in. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 8 ++++++++ kernel/rcu/rcutorture.c | 30 +++++++++++++++--------------- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/include/linux/torture.h b/include/linux/torture.h index 0910c5803f35..24f58e50a94b 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -47,6 +47,14 @@ do { \ } while (0) void verbose_torout_sleep(void); +#define torture_init_error(firsterr) \ +({ \ + int ___firsterr = (firsterr); \ + \ + WARN_ONCE(!IS_MODULE(CONFIG_RCU_TORTURE_TEST) && ___firsterr < 0, "Torture-test initialization failed with error code %d\n", ___firsterr); \ + ___firsterr < 0; \ +}) + /* Definitions for online/offline exerciser. 
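These back the CPU-hotplug exercising that torture_onoff_init() sets up.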
*/ #ifdef CONFIG_HOTPLUG_CPU int torture_num_online_cpus(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59254fa15cc6..b90cd4d98a20 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3037,7 +3037,7 @@ rcu_torture_init(void) rcu_torture_write_types(); firsterr = torture_create_kthread(rcu_torture_writer, NULL, writer_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; if (nfakewriters > 0) { fakewriter_tasks = kcalloc(nfakewriters, @@ -3052,7 +3052,7 @@ rcu_torture_init(void) for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, NULL, fakewriter_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), @@ -3068,7 +3068,7 @@ rcu_torture_init(void) rcu_torture_reader_mbchk[i].rtc_chkrdr = -1; firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } nrealnocbers = nocbs_nthreads; @@ -3088,18 +3088,18 @@ rcu_torture_init(void) } for (i = 0; i < nrealnocbers; i++) { firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(rcu_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter < 0) @@ -3109,7 +3109,7 @@ rcu_torture_init(void) t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ; firsterr = torture_stutter_init(stutter * HZ, t); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (fqs_duration < 0) @@ -3118,7 +3118,7 @@ rcu_torture_init(void) /* Create the fqs thread */ firsterr = torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (test_boost_interval < 1) @@ -3132,7 +3132,7 @@ rcu_torture_init(void) firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", rcutorture_booster_init, rcutorture_booster_cleanup); - if (firsterr < 0) + if (torture_init_error(firsterr)) goto unwind; rcutor_hp = firsterr; @@ -3153,23 +3153,23 @@ rcu_torture_init(void) } shutdown_jiffies = jiffies + shutdown_secs * HZ; firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, rcutorture_sync); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_stall_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_fwd_prog_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_barrier_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; firsterr = rcu_torture_read_exit_init(); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; if (object_debug) rcu_test_debug_objects(); From b3b3cc618ee07f5f4c409e8ca86ac7fbac085ebd Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 5 Aug 2021 15:53:10 -0700 Subject: [PATCH 18/46] locktorture: Warn on individual lock_torture_init() error conditions When running locktorture as a module, any lock_torture_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running locktorture built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing lock_torture_init() errors when running locktorture built-in. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7c5a4a087cc7..397ac13d2ef7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1022,23 +1022,23 @@ static int __init lock_torture_init(void) if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ, NULL); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shutdown_secs > 0) { firsterr = torture_shutdown_init(shutdown_secs, lock_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter > 0) { firsterr = torture_stutter_init(stutter, stutter); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } @@ -1082,7 +1082,7 @@ static int __init lock_torture_init(void) /* Create writer. */ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], writer_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; create_reader: @@ -1091,13 +1091,13 @@ static int __init lock_torture_init(void) /* Create reader. */ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], reader_tasks[j]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } torture_init_end(); From ed60ad733aa49b70720c9d8dded1b18374ec5022 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 15:57:12 -0700 Subject: [PATCH 19/46] refscale: Warn on individual ref_scale_init() error conditions When running refscale as a module, any ref_scale_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running refscale built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing ref_scale_init() errors when running refscale built-in. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refscale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 66dc14cf5687..1631ef8a138d 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -824,7 +824,7 @@ ref_scale_init(void) init_waitqueue_head(&shutdown_wq); firsterr = torture_create_kthread(ref_scale_shutdown, NULL, shutdown_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; schedule_timeout_uninterruptible(1); } @@ -851,7 +851,7 @@ ref_scale_init(void) for (i = 0; i < nreaders; i++) { firsterr = torture_create_kthread(ref_scale_reader, (void *)i, reader_tasks[i].task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; init_waitqueue_head(&(reader_tasks[i].wq)); @@ -860,7 +860,7 @@ ref_scale_init(void) // Main Task init_waitqueue_head(&main_wq); firsterr = torture_create_kthread(main_func, NULL, main_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; torture_init_end(); From eb77abfdeed29dd032c923e16fe8d91fa95cd316 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 15:58:53 -0700 Subject: [PATCH 20/46] rcuscale: Warn on individual rcu_scale_init() error conditions When running rcuscale as a module, any rcu_scale_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running rcuscale built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing rcu_scale_init() errors when running rcuscale built-in. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 2cc34a22a506..228f143bf935 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -758,7 +758,7 @@ kfree_scale_init(void) init_waitqueue_head(&shutdown_wq); firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, shutdown_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; schedule_timeout_uninterruptible(1); } @@ -775,7 +775,7 @@ kfree_scale_init(void) for (i = 0; i < kfree_nrealthreads; i++) { firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, kfree_reader_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } @@ -838,7 +838,7 @@ rcu_scale_init(void) init_waitqueue_head(&shutdown_wq); firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, shutdown_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; schedule_timeout_uninterruptible(1); } @@ -852,7 +852,7 @@ rcu_scale_init(void) for (i = 0; i < nrealreaders; i++) { firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, reader_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) @@ -879,7 +879,7 @@ rcu_scale_init(void) } firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, writer_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } torture_init_end(); From fd13fe16db0d82612b260640f4e26f6d9d1e11fd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Aug 2021 08:57:26 -0700 Subject: [PATCH 21/46] rcutorture: Don't cpuhp_remove_state() if cpuhp_setup_state() failed Currently, in CONFIG_RCU_BOOST kernels, if the rcu_torture_init() function's call to cpuhp_setup_state() fails, rcu_torture_cleanup() gamely passes nonsense to cpuhp_remove_state(). 
This results in strange and misleading splats. This commit therefore ensures that if the rcu_torture_init() function's call to cpuhp_setup_state() fails, rcu_torture_cleanup() avoids invoking cpuhp_remove_state(). Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index b90cd4d98a20..424184764ef0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2819,7 +2819,7 @@ rcu_torture_cleanup(void) rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); - if (rcu_torture_can_boost()) + if (rcu_torture_can_boost() && rcutor_hp >= 0) cpuhp_remove_state(rcutor_hp); /* @@ -3132,9 +3132,9 @@ rcu_torture_init(void) firsterr = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "RCU_TORTURE", rcutorture_booster_init, rcutorture_booster_cleanup); + rcutor_hp = firsterr; if (torture_init_error(firsterr)) goto unwind; - rcutor_hp = firsterr; // Testing RCU priority boosting requires rcutorture do // some serious abuse. Counter this by running ksoftirqd From 71921a9606ddbcc1d98c00eca7ae82c373d1fecd Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Fri, 20 Aug 2021 09:42:36 +0200 Subject: [PATCH 22/46] rcutorture: Avoid problematic critical section nesting on PREEMPT_RT rcutorture is generating some nesting scenarios that are not compatible with PREEMPT_RT. For example: preempt_disable(); rcu_read_lock_bh(); preempt_enable(); rcu_read_unlock_bh(); The problem here is that on PREEMPT_RT the bottom halves have to be disabled and enabled in preemptible context. Reorder the locking: start with BH locking, and only then disable preemption or interrupts. Unlock in the reverse order: first enable interrupts and preemption, and enable BH at the very end. Ensure that on PREEMPT_RT BH locking remains unchanged if in non-preemptible context. Link: https://lkml.kernel.org/r/20190911165729.11178-6-swood@redhat.com Link: https://lkml.kernel.org/r/20210819182035.GF4126399@paulmck-ThinkPad-P17-Gen-1 Signed-off-by: Scott Wood [bigeasy: Drop ATOM_BH, make it only about changing BH in atomic context. Allow enabling RCU in IRQ-off section. Reword commit message.] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 48 ++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 424184764ef0..fb079b78c232 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1432,28 +1432,34 @@ static void rcutorture_one_extend(int *readstate, int newstate, /* First, put new protection in place to avoid critical-section gap. */ if (statesnew & RCUTORTURE_RDR_BH) local_bh_disable(); + if (statesnew & RCUTORTURE_RDR_RBH) + rcu_read_lock_bh(); if (statesnew & RCUTORTURE_RDR_IRQ) local_irq_disable(); if (statesnew & RCUTORTURE_RDR_PREEMPT) preempt_disable(); - if (statesnew & RCUTORTURE_RDR_RBH) - rcu_read_lock_bh(); if (statesnew & RCUTORTURE_RDR_SCHED) rcu_read_lock_sched(); if (statesnew & RCUTORTURE_RDR_RCU) idxnew = cur_ops->readlock() << RCUTORTURE_RDR_SHIFT; - /* Next, remove old protection, irq first due to bh conflict. */ + /* + * Next, remove old protection, in decreasing order of strength + * to avoid unlock paths that aren't safe in the stronger + * context. Namely: BH can not be enabled with disabled interrupts. 
+ * Additionally PREEMPT_RT requires that BH is enabled in preemptible + * context. + */ if (statesold & RCUTORTURE_RDR_IRQ) local_irq_enable(); if (statesold & RCUTORTURE_RDR_PREEMPT) preempt_enable(); if (statesold & RCUTORTURE_RDR_SCHED) rcu_read_unlock_sched(); if (statesold & RCUTORTURE_RDR_BH) local_bh_enable(); if (statesold & RCUTORTURE_RDR_RBH) rcu_read_unlock_bh(); if (statesold & RCUTORTURE_RDR_RCU) { bool lockit = !statesnew && !(torture_random(trsp) & 0xffff); @@ -1496,6 +1502,9 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp) int mask = rcutorture_extend_mask_max(); unsigned long randmask1 = torture_random(trsp) >> 8; unsigned long randmask2 = randmask1 >> 3; + unsigned long preempts = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED; + unsigned long preempts_irq = preempts | RCUTORTURE_RDR_IRQ; + unsigned long bhs = RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; WARN_ON_ONCE(mask >> RCUTORTURE_RDR_SHIFT); /* Mostly only one bit (need preemption!), sometimes lots of bits. */ if (!(randmask1 & 0x7)) mask = mask & randmask2; else mask = mask & (1 << (randmask2 % RCUTORTURE_RDR_NBITS)); - /* Can't enable bh w/irq disabled. */ - if ((mask & RCUTORTURE_RDR_IRQ) && - ((!(mask & RCUTORTURE_RDR_BH) && (oldmask & RCUTORTURE_RDR_BH)) || - (!(mask & RCUTORTURE_RDR_RBH) && (oldmask & RCUTORTURE_RDR_RBH)))) - mask |= RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH; + + /* + * Can't enable bh w/irq disabled. + */ + if (mask & RCUTORTURE_RDR_IRQ) + mask |= oldmask & bhs; + + /* + * Ideally these sequences would be detected in debug builds + * (regardless of RT), but until then don't stop testing + * them on non-RT. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* Can't modify BH in atomic context */ + if (oldmask & preempts_irq) + mask &= ~bhs; + if ((oldmask | mask) & preempts_irq) + mask |= oldmask & bhs; + } + return mask ?: RCUTORTURE_RDR_RCU; } From b380b10b84c3b8334aba51a5a10eabb30f37589f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Jul 2021 15:14:56 -0700 Subject: [PATCH 23/46] torture: Make torture.sh print the number of files to be compressed Compressing gigabyte vmlinux files can take some time, and it can be a bit annoying not to know how many more batches of compression there will be. This commit therefore makes torture.sh print the number of files to be compressed just before starting compression and just after compression completes. Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/rcutorture/bin/torture.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 363f56081eff..8e882346d2a6 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -434,7 +434,12 @@ then batchno=1 if test -s $T/xz-todo then - echo Size before compressing: `du -sh $tdir | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log + for i in `cat $T/xz-todo` + do + find $i -name 'vmlinux*' -print + done | wc -l | awk '{ print $1 }' > $T/xz-todo-count + n2compress="`cat $T/xz-todo-count`" + echo Size before compressing $n2compress files: `du -sh $tdir | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log for i in `cat $T/xz-todo` do echo Compressing vmlinux files in ${i}: `date` >> "$tdir/log-xz" 2>&1 @@ -456,7 +461,7 @@ then echo Waiting for final batch $batchno of $ncompresses compressions `date` | tee -a "$tdir/log-xz" | tee -a $T/log fi wait - echo Size after compressing: `du -sh $tdir | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log + echo Size after compressing $n2compress files: `du -sh $tdir | awk '{ print $1 }'` `date` 2>&1 | tee -a "$tdir/log-xz" | tee -a $T/log echo Total duration `get_starttime_duration $starttime`. | tee -a $T/log else echo No compression needed: `date` >> "$tdir/log-xz" 2>&1 From 96017bf9039763a2e02dcc6adaa18592cd73a39d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jul 2021 10:53:41 -0700 Subject: [PATCH 24/46] rcu-tasks: Simplify trc_read_check_handler() atomic operations Currently, trc_wait_for_one_reader() atomically increments the trc_n_readers_need_end counter before sending the IPI invoking trc_read_check_handler(). All failure paths out of trc_read_check_handler() and also from the smp_call_function_single() within trc_wait_for_one_reader() must carefully atomically decrement this counter. This is more complex than it needs to be. This commit therefore simplifies things and saves a few lines of code by dispensing with the atomic decrements in favor of having trc_read_check_handler() do the atomic increment only in the success case. In theory, this represents no change in functionality. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 3b2f8038064a..c9d8583ffe59 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -890,32 +890,24 @@ static void trc_read_check_handler(void *t_in) // If the task is no longer running on this CPU, leave. if (unlikely(texp != t)) { - if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) - wake_up(&trc_wait); goto reset_ipi; // Already on holdout list, so will check later. } // If the task is not in a read-side critical section, and // if this is the last reader, awaken the grace-period kthread. if (likely(!READ_ONCE(t->trc_reader_nesting))) { - if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) - wake_up(&trc_wait); - // Mark as checked after decrement to avoid false - // positives on the above WARN_ON_ONCE(). WRITE_ONCE(t->trc_reader_checked, true); goto reset_ipi; } // If we are racing with an rcu_read_unlock_trace(), try again later. 
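(A negative nesting value indicates a task part of the way through exiting its read-side critical section.)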
- if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) { - if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end))) - wake_up(&trc_wait); + if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) goto reset_ipi; - } WRITE_ONCE(t->trc_reader_checked, true); // Get here if the task is in a read-side critical section. Set // its state so that it will awaken the grace-period kthread upon // exit from that critical section. + atomic_inc(&trc_n_readers_need_end); // One more to wait on. WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)); WRITE_ONCE(t->trc_reader_special.b.need_qs, true); @@ -1015,21 +1007,15 @@ static void trc_wait_for_one_reader(struct task_struct *t, if (per_cpu(trc_ipi_to_cpu, cpu) || t->trc_ipi_to_cpu >= 0) return; - atomic_inc(&trc_n_readers_need_end); per_cpu(trc_ipi_to_cpu, cpu) = true; t->trc_ipi_to_cpu = cpu; rcu_tasks_trace.n_ipis++; - if (smp_call_function_single(cpu, - trc_read_check_handler, t, 0)) { + if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) { // Just in case there is some other reason for // failure than the target CPU being offline. rcu_tasks_trace.n_ipis_fails++; per_cpu(trc_ipi_to_cpu, cpu) = false; t->trc_ipi_to_cpu = cpu; - if (atomic_dec_and_test(&trc_n_readers_need_end)) { - WARN_ON_ONCE(1); - wake_up(&trc_wait); - } } } } From 18f08e758f34e6dfe0668bee51bd2af7adacf381 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jul 2021 11:32:28 -0700 Subject: [PATCH 25/46] rcu-tasks: Add trc_inspect_reader() checks for exiting critical section Currently, trc_inspect_reader() treats a task exiting its RCU Tasks Trace read-side critical section the same as being within that critical section. However, this can fail because that task might have already checked its .need_qs field, which means that it might never decrement the all-important trc_n_readers_need_end counter. Of course, for that to happen, the task would need to never again execute an RCU Tasks Trace read-side critical section, but this really could happen if the system's last trampoline was removed. Note that exit from such a critical section cannot be treated as a quiescent state due to the possibility of nested critical sections. This means that if trc_inspect_reader() sees a negative nesting value, it must set up to try again later. This commit therefore ignores tasks that are exiting their RCU Tasks Trace read-side critical sections so that they will be rechecked later. [ paulmck: Apply feedback from Neeraj Upadhyay and Boqun Feng. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c9d8583ffe59..8387e70e6b00 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -923,7 +923,7 @@ reset_ipi: static bool trc_inspect_reader(struct task_struct *t, void *arg) { int cpu = task_cpu(t); - bool in_qs = false; + int nesting; bool ofl = cpu_is_offline(cpu); if (task_curr(t)) { @@ -943,18 +943,18 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg) n_heavy_reader_updates++; if (ofl) n_heavy_reader_ofl_updates++; - in_qs = true; + nesting = 0; } else { // The task is not running, so C-language access is safe. - in_qs = likely(!t->trc_reader_nesting); + nesting = t->trc_reader_nesting; } - // Mark as checked so that the grace-period kthread will - // remove it from the holdout list. - t->trc_reader_checked = true; - - if (in_qs) - return true; // Already in quiescent state, done!!! 
+ // If not exiting a read-side critical section, mark as checked + // so that the grace-period kthread will remove it from the + // holdout list. + t->trc_reader_checked = nesting >= 0; + if (nesting <= 0) + return !nesting; // If in QS, done, otherwise try again later. // The task is in a read-side critical section, so set up its // state so that it will awaken the grace-period kthread upon exit From a5c071ccfa1728508f31e61213ee795e4529d0d4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jul 2021 12:28:27 -0700 Subject: [PATCH 26/46] rcu-tasks: Remove second argument of rcu_read_unlock_trace_special() The second argument of rcu_read_unlock_trace_special() is always zero. When called from exit_tasks_rcu_finish_trace(), it is the constant zero, and rcu_read_unlock_trace_special() doesn't get called from rcu_read_unlock_trace() unless the value of local variable "nesting" is zero because in that case the early return is taken instead. This commit therefore removes the "nesting" argument from the rcu_read_unlock_trace_special() function, substituting the constant zero within that function. This commit also adds a WARN_ON_ONCE() to rcu_read_lock_trace_held() in case non-zeroness some day appears. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 5 +++-- kernel/rcu/tasks.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index 86c8f6c98412..6f9c35817398 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -31,7 +31,7 @@ static inline int rcu_read_lock_trace_held(void) #ifdef CONFIG_TASKS_TRACE_RCU -void rcu_read_unlock_trace_special(struct task_struct *t, int nesting); +void rcu_read_unlock_trace_special(struct task_struct *t); /** * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section @@ -80,7 +80,8 @@ static inline void rcu_read_unlock_trace(void) WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. } - rcu_read_unlock_trace_special(t, nesting); + WARN_ON_ONCE(nesting != 0); + rcu_read_unlock_trace_special(t); } void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8387e70e6b00..a3f4f9bd8c67 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -848,7 +848,7 @@ static void rcu_read_unlock_iw(struct irq_work *iwp) static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw); /* If we are the last reader, wake up the grace-period kthread. */ -void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) +void rcu_read_unlock_trace_special(struct task_struct *t) { int nq = READ_ONCE(t->trc_reader_special.b.need_qs); @@ -858,7 +858,7 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting) // Update .need_qs before ->trc_reader_nesting for irq/NMI handlers. 
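That is, a handler observing a zero nesting count must also observe the cleared .need_qs.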
if (nq) WRITE_ONCE(t->trc_reader_special.b.need_qs, false); - WRITE_ONCE(t->trc_reader_nesting, nesting); + WRITE_ONCE(t->trc_reader_nesting, 0); if (nq && atomic_dec_and_test(&trc_n_readers_need_end)) irq_work_queue(&rcu_tasks_trace_iw); } @@ -1200,7 +1200,7 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t) WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting)); WRITE_ONCE(t->trc_reader_nesting, 0); if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs))) - rcu_read_unlock_trace_special(t, 0); + rcu_read_unlock_trace_special(t); } /** From c4f113ac450afc9c7c4d2ce84a08f516dbec69b1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 09:54:45 -0700 Subject: [PATCH 27/46] rcu-tasks: Fix s/instruction/instructions/ typo in comment Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a3f4f9bd8c67..43c0f715ac63 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -758,7 +758,7 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); // 2. Protects code in the idle loop, exception entry/exit, and // CPU-hotplug code paths, similar to the capabilities of SRCU. // -// 3. Avoids expensive read-side instruction, having overhead similar +// 3. Avoids expensive read-side instructions, having overhead similar // to that of Preemptible RCU. // // There are of course downsides. The grace-period code can send IPIs to From 0db7c32ad3160ae06f497d48a74bd46a2a35e6bf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Aug 2021 09:07:44 -0700 Subject: [PATCH 28/46] rcu-tasks: Move RTGS_WAIT_CBS to beginning of rcu_tasks_kthread() loop Early in debugging, it made some sense to differentiate the first iteration from subsequent iterations, but now this just causes confusion. This commit therefore moves the "set_tasks_gp_state(rtp, RTGS_WAIT_CBS)" statement to the beginning of the "for" loop in rcu_tasks_kthread(). Reported-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 43c0f715ac63..7e2641783e43 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -197,6 +197,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) * This loop is terminated by the system going down. ;-) */ for (;;) { + set_tasks_gp_state(rtp, RTGS_WAIT_CBS); /* Pick up any new callbacks. */ raw_spin_lock_irqsave(&rtp->cbs_lock, flags); @@ -236,8 +237,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) } /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); - - set_tasks_gp_state(rtp, RTGS_WAIT_CBS); } } From d0a85858569ead8d39ba5b41501cd99bc7d7e7bd Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Wed, 18 Aug 2021 12:58:39 +0530 Subject: [PATCH 29/46] rcu-tasks: Fix s/rcu_add_holdout/trc_add_holdout/ typo in comment Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7e2641783e43..75e7888b3fc9 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -991,7 +991,7 @@ static void trc_wait_for_one_reader(struct task_struct *t, // If this task is not yet on the holdout list, then we are in // an RCU read-side critical section. 
Otherwise, the invocation of
-	// rcu_add_holdout() that added it to the list did the necessary
+	// trc_add_holdout() that added it to the list did the necessary
 	// get_task_struct(). Either way, the task cannot be freed out
 	// from under this code.

From 89401176daf0a44ab517d9a0e296adb85af246df Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay
Date: Wed, 18 Aug 2021 12:58:40 +0530
Subject: [PATCH 30/46] rcu-tasks: Correct firstreport usage in check_all_holdout_tasks_trace

In check_all_holdout_tasks_trace(), firstreport is a pointer argument,
so check the dereferenced value instead of checking the pointer itself.

Signed-off-by: Neeraj Upadhyay
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 75e7888b3fc9..e2ec548fc0c4 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1129,7 +1129,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop,
 	cpus_read_unlock();
 
 	if (needreport) {
-		if (firstreport)
+		if (*firstreport)
 			pr_err("INFO: rcu_tasks_trace detected stalls? (Late IPI?)\n");
 		show_stalled_ipi_trace();
 	}

From d39ec8f3c12abe3710f7031ce3d5564bda12b19e Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay
Date: Wed, 18 Aug 2021 12:58:41 +0530
Subject: [PATCH 31/46] rcu-tasks: Correct comparisons for CPU numbers in show_stalled_task_trace

Valid CPU numbers can be zero or greater, but the checks for
->trc_ipi_to_cpu and tick_nohz_full_cpu()'s argument are for strictly
greater than. This commit therefore corrects the nohz_full CPU check in
show_stalled_task_trace() so as to include CPU 0.

Signed-off-by: Neeraj Upadhyay
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tasks.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index e2ec548fc0c4..af7388849bed 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1084,9 +1084,9 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
 	cpu = task_cpu(t);
 	pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n",
 		 t->pid,
-		 ".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
+		 ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0],
 		 ".i"[is_idle_task(t)],
-		 ".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
+		 ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)],
 		 READ_ONCE(t->trc_reader_nesting),
 		 " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
 		 cpu);

From a6517e9ce0115e33617062c9e73b4c5e6f787525 Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay
Date: Wed, 18 Aug 2021 12:58:43 +0530
Subject: [PATCH 32/46] rcu-tasks: Clarify read side section info for rcu_tasks_rude GP primitives

The RCU Tasks Rude variant does not check whether the context currently
running on a CPU is usermode, because a read-side critical section ends
on transition to usermode execution by virtue of usermode execution
being schedulable. Clarify this in the comments for
call_rcu_tasks_rude() and synchronize_rcu_tasks_rude().

Signed-off-by: Neeraj Upadhyay
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tasks.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index af7388849bed..8c63b4d23829 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -677,11 +677,11 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
  * period elapses, in other words after all currently executing RCU
  * read-side critical sections have completed.
call_rcu_tasks_rude() * assumes that the read-side critical sections end at context switch, - * cond_resched_rcu_qs(), or transition to usermode execution. As such, - * there are no read-side primitives analogous to rcu_read_lock() and - * rcu_read_unlock() because this primitive is intended to determine - * that all tasks have passed through a safe state, not so much for - * data-structure synchronization. + * cond_resched_rcu_qs(), or transition to usermode execution (as + * usermode execution is schedulable). As such, there are no read-side + * primitives analogous to rcu_read_lock() and rcu_read_unlock() because + * this primitive is intended to determine that all tasks have passed + * through a safe state, not so much for data-structure synchronization. * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. @@ -699,8 +699,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_rude); * grace period has elapsed, in other words after all currently * executing rcu-tasks read-side critical sections have elapsed. These * read-side critical sections are delimited by calls to schedule(), - * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory, - * anyway) cond_resched(). + * cond_resched_tasks_rcu_qs(), userspace execution (which is a schedulable + * context), and (in theory, anyway) cond_resched(). * * This is a very specialized primitive, intended only for a few uses in * tracing and other situations requiring manipulation of function preambles From ed42c38067129c85ab1bda39f2fd91924a432dc0 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Wed, 25 Aug 2021 12:40:50 +0530 Subject: [PATCH 33/46] rcu-tasks: Fix read-side primitives comment for call_rcu_tasks_trace call_rcu_tasks_trace() does have read-side primitives - rcu_read_lock_trace() and rcu_read_unlock_trace(). Fix this information in the comments. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 8c63b4d23829..47a29e411217 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1207,15 +1207,11 @@ static void exit_tasks_rcu_finish_trace(struct task_struct *t) * @rhp: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * - * The callback function will be invoked some time after a full grace - * period elapses, in other words after all currently executing RCU - * read-side critical sections have completed. call_rcu_tasks_trace() - * assumes that the read-side critical sections end at context switch, - * cond_resched_rcu_qs(), or transition to usermode execution. As such, - * there are no read-side primitives analogous to rcu_read_lock() and - * rcu_read_unlock() because this primitive is intended to determine - * that all tasks have passed through a safe state, not so much for - * data-structure synchronization. + * The callback function will be invoked some time after a trace rcu-tasks + * grace period elapses, in other words after all currently executing + * trace rcu-tasks read-side critical sections have completed. These + * read-side critical sections are delimited by calls to rcu_read_lock_trace() + * and rcu_read_unlock_trace(). * * See the description of call_rcu() for more detailed information on * memory ordering guarantees. 
@@ -1231,7 +1227,7 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
  *
  * Control will return to the caller some time after a trace rcu-tasks
  * grace period has elapsed, in other words after all currently executing
- * rcu-tasks read-side critical sections have elapsed. These read-side
+ * trace rcu-tasks read-side critical sections have elapsed. These read-side
  * critical sections are delimited by calls to rcu_read_lock_trace()
  * and rcu_read_unlock_trace().
  *

From 46aa886c483f57ef13cd5ea0a85e70b93eb1d381 Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay
Date: Fri, 27 Aug 2021 13:43:35 +0530
Subject: [PATCH 34/46] rcu-tasks: Fix IPI failure handling in trc_wait_for_one_reader

The trc_wait_for_one_reader() function is called at multiple stages of
the trace rcu-tasks GP function, rcu_tasks_wait_gp():

- First, it is called as part of the per-task function
  rcu_tasks_trace_pertask(), for all non-idle tasks. As part of per-task
  processing, this function adds the task to the holdout list and, if
  the task is currently running on a CPU, sends an IPI to that CPU. The
  IPI handler takes action depending on whether the task is in a trace
  rcu-tasks read-side critical section or not:

  - a. If the task is in a trace rcu-tasks read-side critical section
    (t->trc_reader_nesting != 0), the IPI handler sets the task's
    ->trc_reader_special.b.need_qs, so that this task notifies exit from
    its outermost read-side critical section (by decrementing
    trc_n_readers_need_end) to the GP handling function.
    trc_wait_for_one_reader() also increments trc_n_readers_need_end, so
    that the trace rcu-tasks GP handler function waits for this task's
    read-side exit notification. The IPI handler also sets
    t->trc_reader_checked to true, so that no further IPIs are sent for
    this task during this trace rcu-tasks grace period and this task can
    be removed from the holdout list.

  - b. If the task is in the process of exiting its trace rcu-tasks
    read-side critical section (t->trc_reader_nesting < 0), defer this
    task's processing to future calls to trc_wait_for_one_reader().

  - c. If the task is not in a trace rcu-tasks read-side critical
    section (t->trc_reader_nesting == 0), ->trc_reader_checked is set
    for this task, so that this task is removed from the holdout list.

- Second, trc_wait_for_one_reader() is called as part of the post scan,
  in the function rcu_tasks_trace_postscan(), for all idle tasks.

- Third, in the function check_all_holdout_tasks_trace(),
  trc_wait_for_one_reader() is called for each task in the holdout list,
  but only if there isn't a pending IPI for the task
  (->trc_ipi_to_cpu == -1). check_all_holdout_tasks_trace() removes the
  task from the holdout list once the IPI handler has completed the
  required work, ensuring that the current trace rcu-tasks grace period
  either waits for this task or this task is not in a trace rcu-tasks
  read-side critical section.

Now consider the scenario where smp_call_function_single() fails in the
first case, inside rcu_tasks_trace_pertask(). In this case,
->trc_ipi_to_cpu is set to the current CPU for that task. This results
in trc_wait_for_one_reader() getting skipped in the third case, inside
check_all_holdout_tasks_trace(), for this task. As a consequence,
->trc_reader_checked never gets set for this task and the task is never
removed from the holdout list, which can cause the current trace
rcu-tasks grace period to stall.

Fix the above problem by resetting ->trc_ipi_to_cpu to -1 on
smp_call_function_single() failure, so that future IPIs can be sent for
this task.
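For reference, the holdout-list scan that the stale ->trc_ipi_to_cpu
value short-circuits looks roughly as follows. This is a simplified,
illustrative excerpt of check_all_holdout_tasks_trace() (not part of
this patch's diff): the first "if" never fires for the affected task,
so ->trc_reader_checked stays false and trc_del_holdout() is never
reached:

	list_for_each_entry_safe(t, g, hop, trc_holdout_list) {
		// If safe and needed, try to check the current task.
		if (READ_ONCE(t->trc_ipi_to_cpu) == -1 &&
		    !READ_ONCE(t->trc_reader_checked))
			trc_wait_for_one_reader(t, hop);

		// If the check succeeded, remove this task from the list.
		if (READ_ONCE(t->trc_reader_checked))
			trc_del_holdout(t);
	}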
Note that all three of the trc_wait_for_one_reader() function's callers
(rcu_tasks_trace_pertask(), rcu_tasks_trace_postscan(),
check_all_holdout_tasks_trace()) hold cpus_read_lock(). This means that
smp_call_function_single() cannot race with CPU hotplug, and thus should
never fail. Therefore, also add a warning in order to report any such
failure in case smp_call_function_single() grows some other reason for
failure.

Signed-off-by: Neeraj Upadhyay
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tasks.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 47a29e411217..0c10c8407dca 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1012,9 +1012,11 @@ static void trc_wait_for_one_reader(struct task_struct *t,
 		if (smp_call_function_single(cpu, trc_read_check_handler, t, 0)) {
 			// Just in case there is some other reason for
 			// failure than the target CPU being offline.
+			WARN_ONCE(1, "%s(): smp_call_function_single() failed for CPU: %d\n",
+				  __func__, cpu);
 			rcu_tasks_trace.n_ipis_fails++;
 			per_cpu(trc_ipi_to_cpu, cpu) = false;
-			t->trc_ipi_to_cpu = cpu;
+			t->trc_ipi_to_cpu = -1;
 		}
 	}
 }

From 8af9e2c7826a67a26c2c7a0cd3ce09a5acaf8035 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Wed, 15 Sep 2021 09:24:18 -0700
Subject: [PATCH 35/46] rcu-tasks: Update comments to cond_resched_tasks_rcu_qs()

The cond_resched_rcu_qs() function no longer exists, despite being
mentioned several times in kernel/rcu/tasks.h. This commit therefore
updates those mentions to the current cond_resched_tasks_rcu_qs().

Reported-by: Neeraj Upadhyay
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 0c10c8407dca..66e7586a33e9 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -368,7 +368,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 ////////////////////////////////////////////////////////////////////////
 //
 // Simple variant of RCU whose quiescent states are voluntary context
-// switch, cond_resched_rcu_qs(), user-space execution, and idle.
+// switch, cond_resched_tasks_rcu_qs(), user-space execution, and idle.
 // As such, grace periods can take one good long time. There are no
 // read-side primitives similar to rcu_read_lock() and rcu_read_unlock()
 // because this implementation is intended to get the system into a safe
@@ -539,7 +539,7 @@ DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
  * period elapses, in other words after all currently executing RCU
  * read-side critical sections have completed. call_rcu_tasks() assumes
  * that the read-side critical sections end at a voluntary context
- * switch (not a preemption!), cond_resched_rcu_qs(), entry into idle,
+ * switch (not a preemption!), cond_resched_tasks_rcu_qs(), entry into idle,
  * or transition to usermode execution. As such, there are no read-side
  * primitives analogous to rcu_read_lock() and rcu_read_unlock() because
  * this primitive is intended to determine that all tasks have passed
@@ -677,7 +677,7 @@ DEFINE_RCU_TASKS(rcu_tasks_rude, rcu_tasks_rude_wait_gp, call_rcu_tasks_rude,
  * period elapses, in other words after all currently executing RCU
  * read-side critical sections have completed.
call_rcu_tasks_rude() * assumes that the read-side critical sections end at context switch, - * cond_resched_rcu_qs(), or transition to usermode execution (as + * cond_resched_tasks_rcu_qs(), or transition to usermode execution (as * usermode execution is schedulable). As such, there are no read-side * primitives analogous to rcu_read_lock() and rcu_read_unlock() because * this primitive is intended to determine that all tasks have passed From 925da92ba5cb0c82d07cdd5049a07e40f54e9c44 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 26 Aug 2021 22:21:22 -0400 Subject: [PATCH 36/46] rcu: Avoid unneeded function call in rcu_read_unlock() Since commit aa40c138cc8f3 ("rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs") the function rcu_read_unlock_strict() is invoked by the inlined rcu_read_unlock() function. However, rcu_read_unlock_strict() is an empty function in production kernels, which are built with CONFIG_RCU_STRICT_GRACE_PERIOD=n. There is a mention of rcu_read_unlock_strict() in the BPF verifier, but this is in a deny-list, meaning that BPF does not care whether rcu_read_unlock_strict() is ever called. This commit therefore provides a slight performance improvement by hoisting the check of CONFIG_RCU_STRICT_GRACE_PERIOD from rcu_read_unlock_strict() into rcu_read_unlock(), thus avoiding the pointless call to an empty function. Cc: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Waiman Long Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 3 ++- kernel/rcu/tree_plugin.h | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 434d12fe2d4f..5e0beb5c5659 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -71,7 +71,8 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { preempt_enable(); - rcu_read_unlock_strict(); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d070059163d7..1a6fdb03d0a5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -814,8 +814,7 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || - irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); rcu_report_qs_rdp(rdp); From 2f611d044b8dcab245b6bbe5f691b6dce173ff56 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Jul 2021 14:12:54 -0700 Subject: [PATCH 37/46] scftorture: Allow zero weight to exclude an smp_call_function*() category This commit reworks the weighting calculations to allow zero to be specified to disable a given weight. For example, specifying the scftorture.weight_resched=0 kernel boot parameter without specifying a non-zero value for any of the other scftorture.weight_* parameters would provide the default weights for the others, but would refrain from doing any resched-based IPIs. Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 64a08288b1a6..bc3f8e26345c 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -553,18 +553,18 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_resched == -1 && - weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 && - weight_many == -1 && weight_many_wait == -1 && - weight_all == -1 && weight_all_wait == -1) { - weight_resched1 = 2 * nr_cpu_ids; - weight_single1 = 2 * nr_cpu_ids; - weight_single_rpc1 = 2 * nr_cpu_ids; - weight_single_wait1 = 2 * nr_cpu_ids; - weight_many1 = 2; - weight_many_wait1 = 2; - weight_all1 = 1; - weight_all_wait1 = 1; + if (weight_resched <= 0 && + weight_single <= 0 && weight_single_rpc <= 0 && weight_single_wait <= 0 && + weight_many <= 0 && weight_many_wait <= 0 && + weight_all <= 0 && weight_all_wait <= 0) { + weight_resched1 = weight_resched == 0 ? 0 : 2 * nr_cpu_ids; + weight_single1 = weight_single == 0 ? 0 : 2 * nr_cpu_ids; + weight_single_rpc1 = weight_single_rpc == 0 ? 0 : 2 * nr_cpu_ids; + weight_single_wait1 = weight_single_wait == 0 ? 0 : 2 * nr_cpu_ids; + weight_many1 = weight_many == 0 ? 0 : 2; + weight_many_wait1 = weight_many_wait == 0 ? 0 : 2; + weight_all1 = weight_all == 0 ? 0 : 1; + weight_all_wait1 = weight_all_wait == 0 ? 0 : 1; } else { if (weight_resched == -1) weight_resched1 = 0; From 2b1388f8a408e68fda6443ec166f42ae4ffca87c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Jul 2021 14:20:35 -0700 Subject: [PATCH 38/46] scftorture: Shut down if nonsensical arguments given If (say) a 10-hour scftorture run is started, but the module parameters are so nonsensical that the run doesn't even start, then scftorture will wait the full ten hours when run built into a guest OS. This commit therefore shuts down the system in this case so that the error is reported immediately instead of ten hours hence. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index bc3f8e26345c..31b458b3b113 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -651,6 +651,10 @@ static int __init scf_torture_init(void) unwind: torture_init_end(); scf_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_SCF_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } From da9366c627ef459a1ceb6e5535648683e2adbdb2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Jul 2021 15:13:56 -0700 Subject: [PATCH 39/46] scftorture: Account for weight_resched when checking for all zeroes The "all zero weights makes no sense" error is emitted even when scftorture.weight_resched is non-zero because it was left out of the enclosing "if" condition. This commit adds it in. Fixes: 1ac78b49d61d4 ("scftorture: Add an alternative IPI vector") Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 31b458b3b113..74348abc792b 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -583,8 +583,8 @@ static int __init scf_torture_init(void) if (weight_all_wait == -1) weight_all_wait1 = 0; } - if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 && - weight_many1 == 0 && weight_many_wait1 == 0 && + if (weight_resched1 == 0 && weight_single1 == 0 && weight_single_rpc1 == 0 && + weight_single_wait1 == 0 && weight_many1 == 0 && weight_many_wait1 == 0 && weight_all1 == 0 && weight_all_wait1 == 0) { VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; From c3d0258d5af2a50529e8928fe458344e38653d25 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Jul 2021 06:53:51 -0700 Subject: [PATCH 40/46] scftorture: Count reschedule IPIs Currently, only those IPIs that invoke scftorture's scf_handler() IPI handler function are counted. This means that runs exercising only scftorture.weight_resched will look like they have made no forward progress, resulting in "GP HANG" complaints from the rcutorture scripting. This commit therefore increments the scf_invoked_count per-CPU counter immediately after calling resched_cpu(). Fixes: 1ac78b49d61d4 ("scftorture: Add an alternative IPI vector") Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 74348abc792b..00bba2bdd4d1 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -341,6 +341,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpu = torture_random(trsp) % nr_cpu_ids; scfp->n_resched++; resched_cpu(cpu); + this_cpu_inc(scf_invoked_count); } break; case SCF_PRIM_SINGLE: From f2bdf7dc0da234d78636994367e6ea4af055c689 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 16:01:51 -0700 Subject: [PATCH 41/46] scftorture: Warn on individual scf_torture_init() error conditions When running scftorture as a module, any scf_torture_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running scftorture built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing scf_torture_init() errors when running scftorture built-in. Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 00bba2bdd4d1..5d42f44e3e1a 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -606,17 +606,17 @@ static int __init scf_torture_init(void) if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shutdown_secs > 0) { firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter > 0) { firsterr = torture_stutter_init(stutter, stutter); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } @@ -637,12 +637,12 @@ static int __init scf_torture_init(void) scf_stats_p[i].cpu = i; firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], scf_stats_p[i].task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } From 2010776f8ccb68b85efbade3f19a11b17fb33d74 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jul 2021 10:08:12 -0700 Subject: [PATCH 42/46] tools/rcu: Add an extract-stall script This commit adds a script that extracts RCU CPU stall warnings from console output. The user can optionally specify the number of lines preceding the stall to output, and also the number of lines of stall-warning text. Signed-off-by: Paul E. McKenney --- tools/rcu/extract-stall.sh | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 tools/rcu/extract-stall.sh diff --git a/tools/rcu/extract-stall.sh b/tools/rcu/extract-stall.sh new file mode 100644 index 000000000000..e565697c9f90 --- /dev/null +++ b/tools/rcu/extract-stall.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Extract any RCU CPU stall warnings present in specified file. +# Filter out clocksource lines. Note that preceding-lines excludes the +# initial line of the stall warning but trailing-lines includes it. +# +# Usage: extract-stall.sh dmesg-file [ preceding-lines [ trailing-lines ] ] + +echo $1 +preceding_lines="${2-3}" +trailing_lines="${3-10}" + +awk -v preceding_lines="$preceding_lines" -v trailing_lines="$trailing_lines" ' +suffix <= 0 { + for (i = preceding_lines; i > 0; i--) + last[i] = last[i - 1]; + last[0] = $0; +} + +suffix > 0 { + print $0; + suffix--; + if (suffix <= 0) + print ""; +} + +suffix <= 0 && /detected stall/ { + for (i = preceding_lines; i >= 0; i--) + if (last[i] != "") + print last[i]; + suffix = trailing_lines; +}' < "$1" | tr -d '\015' | grep -v clocksource + From ae3357ac11273fe1aad9c790febf179ef05b930a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jul 2021 16:28:43 -0700 Subject: [PATCH 43/46] torture: Allot 1G of memory for scftorture runs By default, torture.sh allots 512M of memory for each guest OS. However, when running scftorture with KASAN, 1G is needed. This commit therefore causes torture.sh to provide the required 1G. Signed-off-by: Paul E. 
McKenney
---
 tools/testing/selftests/rcutorture/bin/torture.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh
index 8e882346d2a6..eae88aacca2a 100755
--- a/tools/testing/selftests/rcutorture/bin/torture.sh
+++ b/tools/testing/selftests/rcutorture/bin/torture.sh
@@ -351,7 +351,7 @@ fi
 if test "$do_scftorture" = "yes"
 then
 	torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot"
-	torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --trust-make
+	torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 1G --trust-make
 fi
 
 if test "$do_refscale" = yes

From faaaf2ac03a81ad6f9dece28d9cb0b65b515a5cb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Thu, 5 Aug 2021 10:51:11 -0700
Subject: [PATCH 44/46] torture: Make kvm-remote.sh print size of downloaded tarball

This commit causes kvm-remote.sh to print the size of the tarball that
is downloaded to each of the remote systems. This size can help with
performance projections and analysis.

Signed-off-by: Paul E. McKenney
---
 tools/testing/selftests/rcutorture/bin/kvm-remote.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
index 03126eb6ec5a..c7d42ef80c53 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-remote.sh
@@ -149,6 +149,7 @@ do
 done
 
 # Download and expand the tarball on all systems.
+echo Build-products tarball: `du -h $T/binres.tgz` | tee -a "$oldrun/remote-log"
 for i in $systems
 do
 	echo Downloading tarball to $i `date` | tee -a "$oldrun/remote-log"

From 7663ad9a5dbcc27f3090e6bfd192c7e59222709f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 28 Sep 2021 10:40:21 +0200
Subject: [PATCH 45/46] rcu: Always inline rcu_dynticks_task*_{enter,exit}()

RCU managed to grow a few noinstr violations:

vmlinux.o: warning: objtool: rcu_dynticks_eqs_enter()+0x0: call to rcu_dynticks_task_trace_enter() leaves .noinstr.text section
vmlinux.o: warning: objtool: rcu_dynticks_eqs_exit()+0xe: call to rcu_dynticks_task_trace_exit() leaves .noinstr.text section

Fix them by adding __always_inline to the relevant trivial functions.

Also replace the noinstr with __always_inline for the existing
rcu_dynticks_task_*() functions, since noinstr would force them to be
noinline even when empty, which seems silly.

Fixes: 7d0c9c50c5a1 ("rcu-tasks: Avoid IPIing userspace/idle tasks if kernel is so built")
Reported-by: Stephen Rothwell
Reviewed-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree_plugin.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 1a6fdb03d0a5..5199559fbbf0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1479,7 +1479,7 @@ static void rcu_bind_gp_kthread(void)
 }
 
 /* Record the current task on dyntick-idle entry.
*/
-static void noinstr rcu_dynticks_task_enter(void)
+static __always_inline void rcu_dynticks_task_enter(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 	WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id());
@@ -1487,7 +1487,7 @@ static void noinstr rcu_dynticks_task_enter(void)
 }
 
 /* Record no current task on dyntick-idle exit. */
-static void noinstr rcu_dynticks_task_exit(void)
+static __always_inline void rcu_dynticks_task_exit(void)
 {
 #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
 	WRITE_ONCE(current->rcu_tasks_idle_cpu, -1);
@@ -1495,7 +1495,7 @@ static void noinstr rcu_dynticks_task_exit(void)
 }
 
 /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
-static void rcu_dynticks_task_trace_enter(void)
+static __always_inline void rcu_dynticks_task_trace_enter(void)
 {
 #ifdef CONFIG_TASKS_TRACE_RCU
 	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
@@ -1504,7 +1504,7 @@ static void rcu_dynticks_task_trace_enter(void)
 }
 
 /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
-static void rcu_dynticks_task_trace_exit(void)
+static __always_inline void rcu_dynticks_task_trace_exit(void)
 {
 #ifdef CONFIG_TASKS_TRACE_RCU
 	if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))

From 74aece72f95f399dd29363669dc32a1344c8fab4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 28 Sep 2021 10:40:22 +0200
Subject: [PATCH 46/46] rcu: Fix rcu_dynticks_curr_cpu_in_eqs() vs noinstr

vmlinux.o: warning: objtool: rcu_nmi_enter()+0x36: call to __kasan_check_read() leaves .noinstr.text section

noinstr code cannot contain atomic_*() functions because they are
explicitly annotated for instrumentation; use arch_atomic_*() instead.

Fixes: 2be57f732889 ("rcu: Weaken ->dynticks accesses and updates")
Reported-by: Stephen Rothwell
Reviewed-by: Thomas Gleixner
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Paul E. McKenney
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6a1e9d3374db..ef8d36f580fc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -327,7 +327,7 @@ static void rcu_dynticks_eqs_online(void)
  */
 static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
-	return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
+	return !(arch_atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
 }
 
 /*