From ca16265aaf9d357035000833636dcddbfafacac3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 15 Nov 2023 14:11:27 -0500
Subject: [PATCH 01/34] rcu/nocb: Remove needless LOAD-ACQUIRE

The LOAD-ACQUIRE access performed on rdp->nocb_cb_sleep advertises
ordering of callback execution against grace period completion. However
this is contradicted by the following:

* This LOAD-ACQUIRE doesn't pair with anything. The only counterpart
  barrier that can be found is the smp_mb() placed after callbacks
  advancing in nocb_gp_wait(). However the barrier is placed _after_
  the ->nocb_cb_sleep write.

* Callbacks can be concurrently advanced between the LOAD-ACQUIRE on
  ->nocb_cb_sleep and the call to rcu_segcblist_extract_done_cbs() in
  rcu_do_batch(), making any ordering based on ->nocb_cb_sleep broken.

* Both rcu_segcblist_extract_done_cbs() and rcu_advance_cbs() are
  called under the nocb_lock, the latter already providing the desired
  ACQUIRE semantics.

Therefore it is safe to access ->nocb_cb_sleep with a simple compiler
barrier.

Signed-off-by: Frederic Weisbecker
Reviewed-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree_nocb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 4efbf7333d4e..785946834c6b 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -933,8 +933,7 @@ static void nocb_cb_wait(struct rcu_data *rdp)
 	swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
 					    nocb_cb_wait_cond(rdp));
-	// VVV Ensure CB invocation follows _sleep test.
-	if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+	if (READ_ONCE(rdp->nocb_cb_sleep)) {
 		WARN_ON(signal_pending(current));
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
 	}

From 1e8e6951a5774c8dd9d1f14af9c5b7d66130d96f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 15 Nov 2023 14:11:28 -0500
Subject: [PATCH 02/34] rcu/nocb: Remove needless full barrier after callback
 advancing

A full barrier is issued from nocb_gp_wait() upon callbacks advancing
to order grace period completion with callbacks execution.

However these two events are already ordered by the
smp_mb__after_unlock_lock() barrier within the call to
raw_spin_lock_rcu_node() that is necessary for callbacks advancing to
happen.

The following litmus test shows the kind of guarantee that this barrier
provides:

C smp_mb__after_unlock_lock

{}

// rcu_gp_cleanup()
P0(spinlock_t *rnp_lock, int *gpnum)
{
	// Grace period cleanup increase gp sequence number
	spin_lock(rnp_lock);
	WRITE_ONCE(*gpnum, 1);
	spin_unlock(rnp_lock);
}

// nocb_gp_wait()
P1(spinlock_t *rnp_lock, spinlock_t *nocb_lock, int *gpnum, int *cb_ready)
{
	int r1;

	// Call rcu_advance_cbs() from nocb_gp_wait()
	spin_lock(nocb_lock);
	spin_lock(rnp_lock);
	smp_mb__after_unlock_lock();
	r1 = READ_ONCE(*gpnum);
	WRITE_ONCE(*cb_ready, 1);
	spin_unlock(rnp_lock);
	spin_unlock(nocb_lock);
}

// nocb_cb_wait()
P2(spinlock_t *nocb_lock, int *cb_ready, int *cb_executed)
{
	int r2;

	// rcu_do_batch() -> rcu_segcblist_extract_done_cbs()
	spin_lock(nocb_lock);
	r2 = READ_ONCE(*cb_ready);
	spin_unlock(nocb_lock);

	// Actual callback execution
	WRITE_ONCE(*cb_executed, 1);
}

P3(int *cb_executed, int *gpnum)
{
	int r3;

	WRITE_ONCE(*cb_executed, 2);
	smp_mb();
	r3 = READ_ONCE(*gpnum);
}

exists (1:r1=1 /\ 2:r2=1 /\ cb_executed=2 /\ 3:r3=0) (* Bad outcome. *)

Here the bad outcome only occurs if the smp_mb__after_unlock_lock()
barrier is removed.
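The litmus test above can be checked against the Linux-kernel memory
model shipped under tools/memory-model/ in the kernel tree, using the
herd7 tool. A minimal run from that directory, assuming the test has
been saved as smp_mb__after_unlock_lock.litmus, would be:

  # assuming the litmus test above is saved as smp_mb__after_unlock_lock.litmus
  $ herd7 -conf linux-kernel.cfg smp_mb__after_unlock_lock.litmus

Consistently with the above, herd7 only reports the "Bad outcome" state
as reachable once the smp_mb__after_unlock_lock() line is removed from
the test.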
This barrier orders the grace period completion against callbacks
advancing and even later callbacks invocation, thanks to the
opportunistic propagation via the ->nocb_lock to nocb_cb_wait().

Therefore the smp_mb() placed after callbacks advancing can be safely
removed.

Signed-off-by: Frederic Weisbecker
Reviewed-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.c      | 6 ++++++
 kernel/rcu/tree_nocb.h | 1 -
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b2bccfd37c38..d540d210e5c7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2145,6 +2145,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	 * Extract the list of ready callbacks, disabling IRQs to prevent
 	 * races with call_rcu() from interrupt handlers.  Leave the
 	 * callback counts, as rcu_barrier() needs to be conservative.
+	 *
+	 * Callbacks execution is fully ordered against preceding grace period
+	 * completion (materialized by rnp->gp_seq update) thanks to the
+	 * smp_mb__after_unlock_lock() upon node locking required for callbacks
+	 * advancing. In NOCB mode this ordering is then further relayed through
+	 * the nocb locking that protects both callbacks advancing and extraction.
 	 */
 	rcu_nocb_lock_irqsave(rdp, flags);
 	WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 785946834c6b..b2c3145c4c13 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -779,7 +779,6 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
 			needwake = rdp->nocb_cb_sleep;
 			WRITE_ONCE(rdp->nocb_cb_sleep, false);
-			smp_mb(); /* CB invocation -after- GP end. */
 		} else {
 			needwake = false;
 		}

From b913c3fe685e0aec80130975b0f330fd709ff324 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Tue, 9 Jan 2024 23:24:00 +0100
Subject: [PATCH 03/34] rcu/nocb: Make IRQs disablement symmetric

Currently IRQs are disabled on call_rcu() and then depending on the
context:

* If the CPU is in nocb mode:

   - If the callback is enqueued in the bypass list, IRQs are
     re-enabled implicitly by rcu_nocb_try_bypass()

   - If the callback is enqueued in the normal list, IRQs are
     re-enabled implicitly by __call_rcu_nocb_wake()

* If the CPU is NOT in nocb mode, IRQs are re-enabled explicitly from
  call_rcu()

This makes the code a bit hard to follow, especially as it interleaves
with nocb locking.

To make the IRQ flags coverage clearer and also in order to prepare for
moving all the nocb enqueue code to its own function, always re-enable
the IRQ flags explicitly from call_rcu().

Reviewed-by: Neeraj Upadhyay (AMD)
Signed-off-by: Frederic Weisbecker
Reviewed-by: Paul E. McKenney
Signed-off-by: Boqun Feng
---
 kernel/rcu/tree.c      |  9 ++++++---
 kernel/rcu/tree_nocb.h | 20 +++++++++-----------
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d540d210e5c7..a402dc4e9a9c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2735,8 +2735,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
 	}
 	check_cb_ovld(rdp);
-	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
+	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) {
+		local_irq_restore(flags);
 		return; // Enqueued onto ->nocb_bypass, so just leave.
+	}
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kvfree_rcu_offset((unsigned long)func)) @@ -2754,8 +2756,8 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); - local_irq_restore(flags); } + local_irq_restore(flags); } #ifdef CONFIG_RCU_LAZY @@ -4646,8 +4648,9 @@ void rcutree_migrate_callbacks(int cpu) __call_rcu_nocb_wake(my_rdp, true, flags); } else { rcu_nocb_unlock(my_rdp); /* irqs remain disabled. */ - raw_spin_unlock_irqrestore_rcu_node(my_rnp, flags); + raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */ } + local_irq_restore(flags); if (needwake) rcu_gp_kthread_wake(); lockdep_assert_irqs_enabled(); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index b2c3145c4c13..1d5c03c5c702 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -532,9 +532,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // 2. Both of these conditions are met: // a. The bypass list previously had only lazy CBs, and: // b. The new CB is non-lazy. - if (ncbs && (!bypass_is_lazy || lazy)) { - local_irq_restore(flags); - } else { + if (!ncbs || (bypass_is_lazy && !lazy)) { // No-CBs GP kthread might be indefinitely asleep, if so, wake. rcu_nocb_lock(rdp); // Rare during call_rcu() flood. if (!rcu_segcblist_pend_cbs(&rdp->cblist)) { @@ -544,7 +542,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, } else { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQnoWake")); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); } } return true; // Callback already enqueued. @@ -570,7 +568,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); if (rcu_nocb_poll || !t) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNotPoll")); return; @@ -583,17 +581,17 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rdp->qlen_last_fqs_check = len; // Only lazy CBs in bypass list if (lazy_len && bypass_len == lazy_len) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); } else if (!irqs_disabled_flags(flags)) { /* ... if queue was empty ... 
*/ - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); } @@ -611,15 +609,15 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && !timer_pending(&rdp->nocb_timer)) { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } else { - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_nocb_unlock(rdp); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot")); } } From afd4e6964745ed98b74cacdcce21d73280a0a253 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 9 Jan 2024 23:24:01 +0100 Subject: [PATCH 04/34] rcu/nocb: Re-arrange call_rcu() NOCB specific code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the call_rcu() function interleaves NOCB and !NOCB enqueue code in a complicated way such that: * The bypass enqueue code may or may not have enqueued and may or may not have locked the ->nocb_lock. Everything that follows is in a Schrödinger locking state for the unwary reviewer's eyes. * The was_alldone is always set but only used in NOCB related code. * The NOCB wake up is distantly related to the locking hopefully performed by the bypass enqueue code that did not enqueue on the bypass list. Unconfuse the whole and gather NOCB and !NOCB specific enqueue code to their own functions. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 44 +++++++++++++++++++----------------------- kernel/rcu/tree.h | 9 ++++----- kernel/rcu/tree_nocb.h | 18 ++++++++++++++--- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a402dc4e9a9c..cc0e169e299a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2597,12 +2597,26 @@ static int __init rcu_spawn_core_kthreads(void) return 0; } +static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) +{ + rcu_segcblist_enqueue(&rdp->cblist, head); + if (__is_kvfree_rcu_offset((unsigned long)func)) + trace_rcu_kvfree_callback(rcu_state.name, head, + (unsigned long)func, + rcu_segcblist_n_cbs(&rdp->cblist)); + else + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); +} + /* * Handle any core-RCU processing required by a call_rcu() invocation. */ -static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, - unsigned long flags) +static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags) { + rcutree_enqueue(rdp, head, func); /* * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. @@ -2698,7 +2712,6 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) unsigned long flags; bool lazy; struct rcu_data *rdp; - bool was_alldone; /* Misaligned rcu_head! 
*/ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); @@ -2735,28 +2748,11 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } check_cb_ovld(rdp); - if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { - local_irq_restore(flags); - return; // Enqueued onto ->nocb_bypass, so just leave. - } - // If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock. - rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); + + if (unlikely(rcu_rdp_is_offloaded(rdp))) + call_rcu_nocb(rdp, head, func, flags, lazy); else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); - - trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); - - /* Go handle any RCU core processing required. */ - if (unlikely(rcu_rdp_is_offloaded(rdp))) { - __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ - } else { - __call_rcu_core(rdp, head, flags); - } + call_rcu_core(rdp, head, func, flags); local_irq_restore(flags); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e9821a8422db..bf478da89a8f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -467,11 +467,10 @@ static void rcu_init_one_nocb(struct rcu_node *rnp); static bool wake_nocb_gp(struct rcu_data *rdp, bool force); static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, unsigned long j, bool lazy); -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, - bool lazy); -static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, - unsigned long flags); +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy); +static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, + unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level); static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 1d5c03c5c702..9e8052ba14b9 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -622,6 +622,18 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, } } +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) +{ + bool was_alldone; + + if (!rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy)) { + /* Not enqueued on bypass but locked, do regular enqueue */ + rcutree_enqueue(rdp, head, func); + __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ + } +} + static int nocb_gp_toggle_rdp(struct rcu_data *rdp, bool *wake_state) { @@ -1764,10 +1776,10 @@ static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp, return true; } -static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, - bool *was_alldone, unsigned long flags, bool lazy) +static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head, + rcu_callback_t func, unsigned long flags, bool lazy) { - return false; + WARN_ON_ONCE(1); /* Should be dead code! 
*/ } static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, From dda98810b552fc6bf650f4270edeebdc2f28bd3f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 10 Jan 2024 16:11:28 +0800 Subject: [PATCH 05/34] rcu/nocb: Fix WARN_ON_ONCE() in the rcu_nocb_bypass_lock() For the kernels built with CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y and CONFIG_RCU_LAZY=y, the following scenarios will trigger WARN_ON_ONCE() in the rcu_nocb_bypass_lock() and rcu_nocb_wait_contended() functions: CPU2 CPU11 kthread rcu_nocb_cb_kthread ksys_write rcu_do_batch vfs_write rcu_torture_timer_cb proc_sys_write __kmem_cache_free proc_sys_call_handler kmemleak_free drop_caches_sysctl_handler delete_object_full drop_slab __delete_object shrink_slab put_object lazy_rcu_shrink_scan call_rcu rcu_nocb_flush_bypass __call_rcu_commn rcu_nocb_bypass_lock raw_spin_trylock(&rdp->nocb_bypass_lock) fail atomic_inc(&rdp->nocb_lock_contended); rcu_nocb_wait_contended WARN_ON_ONCE(smp_processor_id() != rdp->cpu); WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)) | |_ _ _ _ _ _ _ _ _ _same rdp and rdp->cpu != 11_ _ _ _ _ _ _ _ _ __| Reproduce this bug with "echo 3 > /proc/sys/vm/drop_caches". This commit therefore uses rcu_nocb_try_flush_bypass() instead of rcu_nocb_flush_bypass() in lazy_rcu_shrink_scan(). If the nocb_bypass queue is being flushed, then rcu_nocb_try_flush_bypass will return directly. Signed-off-by: Zqiang Reviewed-by: Joel Fernandes (Google) Reviewed-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_nocb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 9e8052ba14b9..ffa69a5e18f4 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1391,7 +1391,7 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) rcu_nocb_unlock_irqrestore(rdp, flags); continue; } - WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false)); + rcu_nocb_try_flush_bypass(rdp, jiffies); rcu_nocb_unlock_irqrestore(rdp, flags); wake_nocb_gp(rdp, false); sc->nr_to_scan -= _count; From f3c4c00784b5f7499d9cb6d31b661370c9a1ce7f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 17 Jan 2024 18:26:16 +0800 Subject: [PATCH 06/34] rcu/nocb: Check rdp_gp->nocb_timer in __call_rcu_nocb_wake() Currently, only rdp_gp->nocb_timer is used, for nocb_timer of no-rdp_gp structure, the timer_pending() is always return false, this commit therefore need to check rdp_gp->nocb_timer in __call_rcu_nocb_wake(). Signed-off-by: Zqiang Reviewed-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_nocb.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index ffa69a5e18f4..f124d4d45ce6 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -564,6 +564,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, long lazy_len; long len; struct task_struct *t; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; // If we are being polled or there is no kthread, just leave. t = READ_ONCE(rdp->nocb_gp_kthread); @@ -608,7 +609,7 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, smp_mb(); /* Enqueue before timer_pending(). 
*/ if ((rdp->nocb_cb_sleep || !rcu_segcblist_ready_cbs(&rdp->cblist)) && - !timer_pending(&rdp->nocb_timer)) { + !timer_pending(&rdp_gp->nocb_timer)) { rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE, TPS("WakeOvfIsDeferred")); From a7e4074dccd282f494d542150ef6235b3270b0a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 19 Dec 2023 00:19:16 +0100 Subject: [PATCH 07/34] rcu/exp: Remove full barrier upon main thread wakeup When an expedited grace period is ending, care must be taken so that all the quiescent states propagated up to the root are correctly ordered against the wake up of the main expedited grace period workqueue. This ordering is already carried through the root rnp locking augmented by an smp_mb__after_unlock_lock() barrier. Therefore the explicit smp_mb() placed before the wake up is not needed and can be removed. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree_exp.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 2ac440bc7e10..014ddf672165 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -198,10 +198,9 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, } if (rnp->parent == NULL) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ + if (wake) swake_up_one_online(&rcu_state.expedited_wq); - } + break; } mask = rnp->grpmask; From a636c5e6f8fc34be520277e69c7c6ee1d4fc1d17 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:15 +0100 Subject: [PATCH 08/34] rcu/exp: Fix RCU expedited parallel grace period kworker allocation failure recovery Under CONFIG_RCU_EXP_KTHREAD=y, the nodes initialization for expedited grace periods is queued to a kworker. However if the allocation of that kworker failed, the nodes initialization is performed synchronously by the caller instead. Now the check for kworker initialization failure relies on the kworker pointer to be NULL while its value might actually encapsulate an allocation failure error. Make sure to handle this case. Reviewed-by: Kalesh Singh Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker") Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2bccfd37c38..38c86f2c040b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4749,6 +4749,7 @@ static void __init rcu_start_exp_gp_kworkers(void) rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { pr_err("Failed to create %s!\n", par_gp_kworker_name); + rcu_exp_par_gp_kworker = NULL; kthread_destroy_worker(rcu_exp_gp_kworker); return; } From e7539ffc9a770f36bacedcf0fbfb4bf2f244f4a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:16 +0100 Subject: [PATCH 09/34] rcu/exp: Handle RCU expedited grace period kworker allocation failure Just like is done for the kworker performing nodes initialization, gracefully handle the possible allocation failure of the RCU expedited grace period main kworker. While at it perform a rename of the related checking functions to better reflect the expedited specifics. 
Reviewed-by: Kalesh Singh Fixes: 9621fbee44df ("rcu: Move expedited grace period (GP) work to RT kthread_worker") Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 2 ++ kernel/rcu/tree_exp.h | 25 +++++++++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 38c86f2c040b..f2c10d351b59 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4743,6 +4743,7 @@ static void __init rcu_start_exp_gp_kworkers(void) rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", gp_kworker_name); + rcu_exp_gp_kworker = NULL; return; } @@ -4751,6 +4752,7 @@ static void __init rcu_start_exp_gp_kworkers(void) pr_err("Failed to create %s!\n", par_gp_kworker_name); rcu_exp_par_gp_kworker = NULL; kthread_destroy_worker(rcu_exp_gp_kworker); + rcu_exp_gp_kworker = NULL; return; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 014ddf672165..6123a60d9a4d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -427,7 +427,12 @@ static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_exp_gp_kworker); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_exp_par_gp_kworker); } @@ -477,7 +482,12 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) __sync_rcu_exp_select_node_cpus(rewp); } -static inline bool rcu_gp_par_worker_started(void) +static inline bool rcu_exp_worker_started(void) +{ + return !!READ_ONCE(rcu_gp_wq); +} + +static inline bool rcu_exp_par_worker_started(void) { return !!READ_ONCE(rcu_par_gp_wq); } @@ -540,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_gp_par_worker_started() || + if (!rcu_exp_par_worker_started() || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. */ @@ -955,7 +965,7 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool boottime = (rcu_scheduler_active == RCU_SCHEDULER_INIT); + bool use_worker; unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -966,6 +976,9 @@ void synchronize_rcu_expedited(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); + use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) && + rcu_exp_worker_started(); + /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. @@ -995,7 +1008,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(boottime)) { + if (unlikely(!use_worker)) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -1013,7 +1026,7 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. 
*/ mutex_unlock(&rcu_state.exp_mutex); - if (likely(!boottime)) + if (likely(use_worker)) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 7836b270607676ed1c0c6a4a840a2ede9437a6a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:17 +0100 Subject: [PATCH 10/34] rcu: s/boost_kthread_mutex/kthread_mutex This mutex is currently protecting per node boost kthreads creation and affinity setting across CPU hotplug operations. Since the expedited kworkers will soon be split per node as well, they will be subject to the same concurrency constraints against hotplug. Therefore their creation and affinity tuning operations will be grouped with those of boost kthreads and then rely on the same mutex. To prepare for that, generalize its name. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f2c10d351b59..cdb80835c469 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4918,7 +4918,7 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[2]); init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); - mutex_init(&rnp->boost_kthread_mutex); + mutex_init(&rnp->kthread_mutex); raw_spin_lock_init(&rnp->exp_poll_lock); rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e9821a8422db..13e7b0d907ab 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -113,7 +113,7 @@ struct rcu_node { /* side effect, not as a lock. */ unsigned long boost_time; /* When to start boosting (jiffies). */ - struct mutex boost_kthread_mutex; + struct mutex kthread_mutex; /* Exclusion for thread spawning and affinity */ /* manipulation. */ struct task_struct *boost_kthread_task; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 41021080ad25..0d307674915c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1195,7 +1195,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->boost_kthread_mutex); + mutex_lock(&rnp->kthread_mutex); if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) goto out; @@ -1212,7 +1212,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ out: - mutex_unlock(&rnp->boost_kthread_mutex); + mutex_unlock(&rnp->kthread_mutex); } /* @@ -1224,7 +1224,7 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. * - * Any future concurrent calls are serialized via ->boost_kthread_mutex. + * Any future concurrent calls are serialized via ->kthread_mutex. 
*/ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { @@ -1237,7 +1237,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - mutex_lock(&rnp->boost_kthread_mutex); + mutex_lock(&rnp->kthread_mutex); mask = rcu_rnp_online_cpus(rnp); for_each_leaf_node_possible_cpu(rnp, cpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && @@ -1250,7 +1250,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) cpumask_clear_cpu(outgoingcpu, cm); } set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->boost_kthread_mutex); + mutex_unlock(&rnp->kthread_mutex); free_cpumask_var(cm); } From c19e5d3b497a3036f800edf751dc7814e3e887e1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:18 +0100 Subject: [PATCH 11/34] rcu/exp: Move expedited kthread worker creation functions above rcutree_prepare_cpu() The expedited kthread worker performing the per node initialization is going to be split into per node kthreads. As such, the future per node kthread creation will need to be called from CPU hotplug callbacks instead of an initcall, right beside the per node boost kthread creation. To prepare for that, move the kthread worker creation above rcutree_prepare_cpu() as a first step to make the review smoother for the upcoming modifications. No intended functional change. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 96 +++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cdb80835c469..657ac12f9e27 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4394,6 +4394,54 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +#ifdef CONFIG_RCU_EXP_KTHREAD +struct kthread_worker *rcu_exp_gp_kworker; +struct kthread_worker *rcu_exp_par_gp_kworker; + +static void __init rcu_start_exp_gp_kworkers(void) +{ + const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; + const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct sched_param param = { .sched_priority = kthread_prio }; + + rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { + pr_err("Failed to create %s!\n", gp_kworker_name); + rcu_exp_gp_kworker = NULL; + return; + } + + rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); + if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { + pr_err("Failed to create %s!\n", par_gp_kworker_name); + rcu_exp_par_gp_kworker = NULL; + kthread_destroy_worker(rcu_exp_gp_kworker); + rcu_exp_gp_kworker = NULL; + return; + } + + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); + sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, + ¶m); +} + +static inline void rcu_alloc_par_gp_wq(void) +{ +} +#else /* !CONFIG_RCU_EXP_KTHREAD */ +struct workqueue_struct *rcu_par_gp_wq; + +static void __init rcu_start_exp_gp_kworkers(void) +{ +} + +static inline void rcu_alloc_par_gp_wq(void) +{ + rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); + WARN_ON(!rcu_par_gp_wq); +} +#endif /* CONFIG_RCU_EXP_KTHREAD */ + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. 
@@ -4730,54 +4778,6 @@ static int rcu_pm_notify(struct notifier_block *self, return NOTIFY_OK; } -#ifdef CONFIG_RCU_EXP_KTHREAD -struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; - -static void __init rcu_start_exp_gp_kworkers(void) -{ - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; - struct sched_param param = { .sched_priority = kthread_prio }; - - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); - rcu_exp_gp_kworker = NULL; - return; - } - - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - rcu_exp_par_gp_kworker = NULL; - kthread_destroy_worker(rcu_exp_gp_kworker); - rcu_exp_gp_kworker = NULL; - return; - } - - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); -} - -static inline void rcu_alloc_par_gp_wq(void) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void __init rcu_start_exp_gp_kworkers(void) -{ -} - -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Spawn the kthreads that handle RCU's grace periods. */ From 8e5e621566485a3e160c0d8bfba206cb1d6b980d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:19 +0100 Subject: [PATCH 12/34] rcu/exp: Make parallel exp gp kworker per rcu node When CONFIG_RCU_EXP_KTHREAD=n, the expedited grace period per node initialization is performed in parallel via workqueues (one work per node). However in CONFIG_RCU_EXP_KTHREAD=y, this per node initialization is performed by a single kworker serializing each node initialization (one work for all nodes). The second part is certainly less scalable and efficient beyond a single leaf node. To improve this, expand this single kworker into per-node kworkers. This new layout is eventually intended to remove the workqueues based implementation since it will essentially now become duplicate code. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 1 - kernel/rcu/tree.c | 65 ++++++++++++++++++++++++++-------------- kernel/rcu/tree.h | 3 ++ kernel/rcu/tree_exp.h | 10 +++---- kernel/rcu/tree_plugin.h | 10 ++----- 5 files changed, 54 insertions(+), 35 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f94f65877f2b..6beaf70d629f 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -625,7 +625,6 @@ void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; #ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -extern struct kthread_worker *rcu_exp_par_gp_kworker; #else /* !CONFIG_RCU_EXP_KTHREAD */ extern struct workqueue_struct *rcu_par_gp_wq; #endif /* CONFIG_RCU_EXP_KTHREAD */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 657ac12f9e27..398c099d45d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4396,33 +4396,39 @@ rcu_boot_init_percpu_data(int cpu) #ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_worker *rcu_exp_gp_kworker; -struct kthread_worker *rcu_exp_par_gp_kworker; -static void __init rcu_start_exp_gp_kworkers(void) +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) { - const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker"; - const char *gp_kworker_name = "rcu_exp_gp_kthread_worker"; + struct kthread_worker *kworker; + const char *name = "rcu_exp_par_gp_kthread_worker/%d"; + struct sched_param param = { .sched_priority = kthread_prio }; + int rnp_index = rnp - rcu_get_root(); + + if (rnp->exp_kworker) + return; + + kworker = kthread_create_worker(0, name, rnp_index); + if (IS_ERR_OR_NULL(kworker)) { + pr_err("Failed to create par gp kworker on %d/%d\n", + rnp->grplo, rnp->grphi); + return; + } + WRITE_ONCE(rnp->exp_kworker, kworker); + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); +} + +static void __init rcu_start_exp_gp_kworker(void) +{ + const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name); + rcu_exp_gp_kworker = kthread_create_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { - pr_err("Failed to create %s!\n", gp_kworker_name); + pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; return; } - - rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name); - if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) { - pr_err("Failed to create %s!\n", par_gp_kworker_name); - rcu_exp_par_gp_kworker = NULL; - kthread_destroy_worker(rcu_exp_gp_kworker); - rcu_exp_gp_kworker = NULL; - return; - } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); - sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO, - ¶m); } static inline void rcu_alloc_par_gp_wq(void) @@ -4431,7 +4437,11 @@ static inline void rcu_alloc_par_gp_wq(void) #else /* !CONFIG_RCU_EXP_KTHREAD */ struct workqueue_struct *rcu_par_gp_wq; -static void __init rcu_start_exp_gp_kworkers(void) +static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) +{ +} + +static void __init rcu_start_exp_gp_kworker(void) { } @@ -4442,6 +4452,17 @@ static inline void rcu_alloc_par_gp_wq(void) } #endif /* CONFIG_RCU_EXP_KTHREAD */ +static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) +{ + if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) || + IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) { + mutex_lock(&rnp->kthread_mutex); + rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_exp_par_gp_kworker(rnp); + 
mutex_unlock(&rnp->kthread_mutex); + } +} + /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. @@ -4490,7 +4511,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcu_spawn_one_boost_kthread(rnp); + rcu_spawn_rnp_kthreads(rnp); rcu_spawn_cpu_nocb_kthread(cpu); WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); @@ -4812,10 +4833,10 @@ static int __init rcu_spawn_gp_kthread(void) * due to rcu_scheduler_fully_active. */ rcu_spawn_cpu_nocb_kthread(smp_processor_id()); - rcu_spawn_one_boost_kthread(rdp->mynode); + rcu_spawn_rnp_kthreads(rdp->mynode); rcu_spawn_core_kthreads(); /* Create kthread worker for expedited GPs */ - rcu_start_exp_gp_kworkers(); + rcu_start_exp_gp_kworker(); return 0; } early_initcall(rcu_spawn_gp_kthread); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 13e7b0d907ab..e173808f486f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -72,6 +72,9 @@ struct rcu_node { /* Online CPUs for next expedited GP. */ /* Any CPU that has ever been online will */ /* have its bit set. */ + struct kthread_worker *exp_kworker; + /* Workers performing per node expedited GP */ + /* initialization. */ unsigned long cbovldmask; /* CPUs experiencing callback overload. */ unsigned long ffmask; /* Fully functional CPUs. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6123a60d9a4d..0318a8a062d5 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -432,9 +432,9 @@ static inline bool rcu_exp_worker_started(void) return !!READ_ONCE(rcu_exp_gp_kworker); } -static inline bool rcu_exp_par_worker_started(void) +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) { - return !!READ_ONCE(rcu_exp_par_gp_kworker); + return !!READ_ONCE(rnp->exp_kworker); } static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) @@ -445,7 +445,7 @@ static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) * another work item on the same kthread worker can result in * deadlock. */ - kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work); + kthread_queue_work(READ_ONCE(rnp->exp_kworker), &rnp->rew.rew_work); } static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) @@ -487,7 +487,7 @@ static inline bool rcu_exp_worker_started(void) return !!READ_ONCE(rcu_gp_wq); } -static inline bool rcu_exp_par_worker_started(void) +static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) { return !!READ_ONCE(rcu_par_gp_wq); } @@ -550,7 +550,7 @@ static void sync_rcu_exp_select_cpus(void) rnp->exp_need_flush = false; if (!READ_ONCE(rnp->expmask)) continue; /* Avoid early boot non-existent wq. */ - if (!rcu_exp_par_worker_started() || + if (!rcu_exp_par_worker_started(rnp) || rcu_scheduler_active != RCU_SCHEDULER_RUNNING || rcu_is_last_leaf_node(rnp)) { /* No worker started yet or last leaf, do direct call. 
*/ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0d307674915c..09bdd36ca9ff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1195,14 +1195,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) struct sched_param sp; struct task_struct *t; - mutex_lock(&rnp->kthread_mutex); - if (rnp->boost_kthread_task || !rcu_scheduler_fully_active) - goto out; + if (rnp->boost_kthread_task) + return; t = kthread_create(rcu_boost_kthread, (void *)rnp, "rcub/%d", rnp_index); if (WARN_ON_ONCE(IS_ERR(t))) - goto out; + return; raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; @@ -1210,9 +1209,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ - - out: - mutex_unlock(&rnp->kthread_mutex); } /* From b67cffcbbf9dc759d95d330a5af5d1480af2b1f1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:20 +0100 Subject: [PATCH 13/34] rcu/exp: Handle parallel exp gp kworkers affinity Affine the parallel expedited gp kworkers to their respective RCU node in order to make them close to the cache their are playing with. This reuses the boost kthreads machinery that probe into CPU hotplug operations such that the kthreads become/stay affine to their respective node as soon/long as they contain online CPUs. Otherwise and if the current CPU going down was the last online on the leaf node, the related kthread is affine to the housekeeping CPUs. In the long run, this affinity VS CPU hotplug operation game should probably be implemented at the generic kthread level. Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney [boqun: s/* rcu_boost_task/*rcu_boost_task as reported by checkpatch] Signed-off-by: Boqun Feng --- kernel/rcu/tree.c | 79 +++++++++++++++++++++++++++++++++++++--- kernel/rcu/tree_plugin.h | 42 ++------------------- 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 398c099d45d9..312c4c5d4509 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -145,7 +145,7 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -4417,6 +4417,16 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); } +static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) +{ + struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); + + if (!kworker) + return NULL; + + return kworker->task; +} + static void __init rcu_start_exp_gp_kworker(void) { const char *name = "rcu_exp_gp_kthread_worker"; @@ -4441,6 +4451,11 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) { } +static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) +{ + return NULL; +} + static void __init rcu_start_exp_gp_kworker(void) { } @@ -4519,13 +4534,67 @@ int rcutree_prepare_cpu(unsigned int cpu) } /* - * Update RCU priority boot kthread affinity for CPU-hotplug changes. + * Update kthreads affinity during CPU-hotplug changes. 
+ * + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. + * + * Any future concurrent calls are serialized via ->kthread_mutex. */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoing) +static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) { - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + cpumask_var_t cm; + unsigned long mask; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct task_struct *task_boost, *task_exp; - rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); + if (!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST)) + return; + + rdp = per_cpu_ptr(&rcu_data, cpu); + rnp = rdp->mynode; + + task_boost = rcu_boost_task(rnp); + task_exp = rcu_exp_par_gp_task(rnp); + + /* + * If CPU is the boot one, those tasks are created later from early + * initcall since kthreadd must be created first. + */ + if (!task_boost && !task_exp) + return; + + if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) + return; + + mutex_lock(&rnp->kthread_mutex); + mask = rcu_rnp_online_cpus(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (cpumask_empty(cm)) { + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + if (outgoingcpu >= 0) + cpumask_clear_cpu(outgoingcpu, cm); + } + + if (task_exp) + set_cpus_allowed_ptr(task_exp, cm); + + if (task_boost) + set_cpus_allowed_ptr(task_boost, cm); + + mutex_unlock(&rnp->kthread_mutex); + + free_cpumask_var(cm); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 09bdd36ca9ff..36a8b5dbf5b5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1211,43 +1211,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -/* - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. 
- */ -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct *rcu_boost_task(struct rcu_node *rnp) { - struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask; - cpumask_var_t cm; - int cpu; - - if (!t) - return; - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - set_cpus_allowed_ptr(t, cm); - mutex_unlock(&rnp->kthread_mutex); - free_cpumask_var(cm); + return READ_ONCE(rnp->boost_kthread_task); } #else /* #ifdef CONFIG_RCU_BOOST */ @@ -1266,10 +1232,10 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +static struct task_struct *rcu_boost_task(struct rcu_node *rnp) { + return NULL; } - #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* From 23da2ad64dbe9f3fab10af90484fe41e144337b1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jan 2024 16:46:21 +0100 Subject: [PATCH 14/34] rcu/exp: Remove rcu_par_gp_wq TREE04 running on short iterations can produce writer stalls of the following kind: ??? Writer stall state RTWS_EXP_SYNC(4) g3968 f0x0 ->state 0x2 cpu 0 task:rcu_torture_wri state:D stack:14568 pid:83 ppid:2 flags:0x00004000 Call Trace: __schedule+0x2de/0x850 ? trace_event_raw_event_rcu_exp_funnel_lock+0x6d/0xb0 schedule+0x4f/0x90 synchronize_rcu_expedited+0x430/0x670 ? __pfx_autoremove_wake_function+0x10/0x10 ? __pfx_synchronize_rcu_expedited+0x10/0x10 do_rtws_sync.constprop.0+0xde/0x230 rcu_torture_writer+0x4b4/0xcd0 ? __pfx_rcu_torture_writer+0x10/0x10 kthread+0xc7/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Waiting for an expedited grace period and polling for an expedited grace period both are operations that internally rely on the same workqueue performing necessary asynchronous work. However, a dependency chain is involved between those two operations, as depicted below: ====== CPU 0 ======= ====== CPU 1 ======= synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); start_poll_synchronize_rcu_expedited queue_work(rcu_gp_wq, &rnp->exp_poll_wq); synchronize_rcu_expedited_queue_work() queue_work(rcu_gp_wq, &rew->rew_work); wait_event() // A, wait for &rew->rew_work completion mutex_unlock() // B //======> switch to kworker sync_rcu_do_polled_gp() { synchronize_rcu_expedited() exp_funnel_lock() mutex_lock(&rcu_state.exp_mutex); // C, wait B .... } // D Since workqueues are usually implemented on top of several kworkers handling the queue concurrently, the above situation wouldn't deadlock most of the time because A then doesn't depend on D. But in case of memory stress, a single kworker may end up handling alone all the works in a serialized way. In that case the above layout becomes a problem because A then waits for D, closing a circular dependency: A -> D -> C -> B -> A This however only happens when CONFIG_RCU_EXP_KTHREAD=n. Indeed synchronize_rcu_expedited() is otherwise implemented on top of a kthread worker while polling still relies on rcu_gp_wq workqueue, breaking the above circular dependency chain. 
Fix this with making expedited grace period to always rely on kthread worker. The workqueue based implementation is essentially a duplicate anyway now that the per-node initialization is performed by per-node kthread workers. Meanwhile the CONFIG_RCU_EXP_KTHREAD switch is still kept around to manage the scheduler policy of these kthread workers. Reported-by: Anna-Maria Behnsen Reported-by: Thomas Gleixner Suggested-by: Joel Fernandes Suggested-by: Paul E. McKenney Suggested-by: Neeraj upadhyay Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 4 --- kernel/rcu/tree.c | 40 ++++-------------------- kernel/rcu/tree.h | 6 +--- kernel/rcu/tree_exp.h | 73 +------------------------------------------ 4 files changed, 8 insertions(+), 115 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 6beaf70d629f..99032b9cb667 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -623,11 +623,7 @@ int rcu_get_gp_kthreads_prio(void); void rcu_fwd_progress_check(unsigned long j); void rcu_force_quiescent_state(void); extern struct workqueue_struct *rcu_gp_wq; -#ifdef CONFIG_RCU_EXP_KTHREAD extern struct kthread_worker *rcu_exp_gp_kworker; -#else /* !CONFIG_RCU_EXP_KTHREAD */ -extern struct workqueue_struct *rcu_par_gp_wq; -#endif /* CONFIG_RCU_EXP_KTHREAD */ void rcu_gp_slow_register(atomic_t *rgssp); void rcu_gp_slow_unregister(atomic_t *rgssp); #endif /* #else #ifdef CONFIG_TINY_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 312c4c5d4509..9591c22408a1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4394,7 +4394,6 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4414,7 +4413,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) return; } WRITE_ONCE(rnp->exp_kworker, kworker); - sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); + + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); } static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) @@ -4438,39 +4439,14 @@ static void __init rcu_start_exp_gp_kworker(void) rcu_exp_gp_kworker = NULL; return; } - sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); -} -static inline void rcu_alloc_par_gp_wq(void) -{ + if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) + sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, ¶m); } -#else /* !CONFIG_RCU_EXP_KTHREAD */ -struct workqueue_struct *rcu_par_gp_wq; - -static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) -{ -} - -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - return NULL; -} - -static void __init rcu_start_exp_gp_kworker(void) -{ -} - -static inline void rcu_alloc_par_gp_wq(void) -{ - rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0); - WARN_ON(!rcu_par_gp_wq); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) { - if ((IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) || - IS_ENABLED(CONFIG_RCU_BOOST)) && rcu_scheduler_fully_active) { + if (rcu_scheduler_fully_active) { mutex_lock(&rnp->kthread_mutex); rcu_spawn_one_boost_kthread(rnp); rcu_spawn_exp_par_gp_kworker(rnp); @@ -4554,9 +4530,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) struct rcu_node *rnp; struct task_struct *task_boost, *task_exp; - if 
(!IS_ENABLED(CONFIG_RCU_EXP_KTHREAD) && !IS_ENABLED(CONFIG_RCU_BOOST)) - return; - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; @@ -5245,7 +5218,6 @@ void __init rcu_init(void) /* Create workqueue for Tree SRCU and for expedited GPs. */ rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); - rcu_alloc_par_gp_wq(); /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e173808f486f..f35e47f24d80 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -21,14 +21,10 @@ #include "rcu_segcblist.h" -/* Communicate arguments to a workqueue handler. */ +/* Communicate arguments to a kthread worker handler. */ struct rcu_exp_work { unsigned long rew_s; -#ifdef CONFIG_RCU_EXP_KTHREAD struct kthread_work rew_work; -#else - struct work_struct rew_work; -#endif /* CONFIG_RCU_EXP_KTHREAD */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0318a8a062d5..6b83537480b1 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -418,7 +418,6 @@ retry_ipi: static void rcu_exp_sel_wait_wake(unsigned long s); -#ifdef CONFIG_RCU_EXP_KTHREAD static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp) { struct rcu_exp_work *rewp = @@ -470,69 +469,6 @@ static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew kthread_queue_work(rcu_exp_gp_kworker, &rew->rew_work); } -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ -} -#else /* !CONFIG_RCU_EXP_KTHREAD */ -static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) -{ - struct rcu_exp_work *rewp = - container_of(wp, struct rcu_exp_work, rew_work); - - __sync_rcu_exp_select_node_cpus(rewp); -} - -static inline bool rcu_exp_worker_started(void) -{ - return !!READ_ONCE(rcu_gp_wq); -} - -static inline bool rcu_exp_par_worker_started(struct rcu_node *rnp) -{ - return !!READ_ONCE(rcu_par_gp_wq); -} - -static inline void sync_rcu_exp_select_cpus_queue_work(struct rcu_node *rnp) -{ - int cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1); - - INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus); - /* If all offline, queue the work on an unbound CPU. */ - if (unlikely(cpu > rnp->grphi - rnp->grplo)) - cpu = WORK_CPU_UNBOUND; - else - cpu += rnp->grplo; - queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work); -} - -static inline void sync_rcu_exp_select_cpus_flush_work(struct rcu_node *rnp) -{ - flush_work(&rnp->rew.rew_work); -} - -/* - * Work-queue handler to drive an expedited grace period forward. - */ -static void wait_rcu_exp_gp(struct work_struct *wp) -{ - struct rcu_exp_work *rewp; - - rewp = container_of(wp, struct rcu_exp_work, rew_work); - rcu_exp_sel_wait_wake(rewp->rew_s); -} - -static inline void synchronize_rcu_expedited_queue_work(struct rcu_exp_work *rew) -{ - INIT_WORK_ONSTACK(&rew->rew_work, wait_rcu_exp_gp); - queue_work(rcu_gp_wq, &rew->rew_work); -} - -static inline void synchronize_rcu_expedited_destroy_work(struct rcu_exp_work *rew) -{ - destroy_work_on_stack(&rew->rew_work); -} -#endif /* CONFIG_RCU_EXP_KTHREAD */ - /* * Select the nodes that the upcoming expedited grace period needs * to wait for. 
@@ -965,7 +901,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - bool use_worker; unsigned long flags; struct rcu_exp_work rew; struct rcu_node *rnp; @@ -976,9 +911,6 @@ void synchronize_rcu_expedited(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); - use_worker = (rcu_scheduler_active != RCU_SCHEDULER_INIT) && - rcu_exp_worker_started(); - /* Is the state is such that the call is a grace period? */ if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. @@ -1008,7 +940,7 @@ void synchronize_rcu_expedited(void) return; /* Someone else did our work for us. */ /* Ensure that load happens before action based on it. */ - if (unlikely(!use_worker)) { + if (unlikely((rcu_scheduler_active == RCU_SCHEDULER_INIT) || !rcu_exp_worker_started())) { /* Direct call during scheduler init and early_initcalls(). */ rcu_exp_sel_wait_wake(s); } else { @@ -1025,9 +957,6 @@ void synchronize_rcu_expedited(void) /* Let the next expedited grace period start. */ mutex_unlock(&rcu_state.exp_mutex); - - if (likely(use_worker)) - synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); From 120311acb01d7360dcc70c0862c83758fbcd28d2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 14:55:37 -0800 Subject: [PATCH 15/34] doc: Spinlocks are implied RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because the -rt locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade rcu_dereference.rst to document this non-obvious case. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/CAHk-=whGKvjHCtJ6W4pQ0_h_k9fiFQ8V2GpM=BqYnB2X=SJ+XQ@mail.gmail.com/ Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/rcu_dereference.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Documentation/RCU/rcu_dereference.rst b/Documentation/RCU/rcu_dereference.rst index 659d5913784d..2524dcdadde2 100644 --- a/Documentation/RCU/rcu_dereference.rst +++ b/Documentation/RCU/rcu_dereference.rst @@ -408,7 +408,10 @@ member of the rcu_dereference() to use in various situations: RCU flavors, an RCU read-side critical section is entered using rcu_read_lock(), anything that disables bottom halves, anything that disables interrupts, or anything that disables - preemption. + preemption. Please note that spinlock critical sections + are also implied RCU read-side critical sections, even when + they are preemptible, as they are in kernels built with + CONFIG_PREEMPT_RT=y. 2. If the access might be within an RCU read-side critical section on the one hand, or protected by (say) my_lock on the other, From 8dbc33b4d1a19ff43930dc983c457946241078e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 15:06:46 -0800 Subject: [PATCH 16/34] doc: Make whatisRCU.rst note that spinlocks are RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade whatisRCU.rst to document this non-obvious case. Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/whatisRCU.rst | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 60ce02475142..246ce0d0b4d1 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -172,14 +172,25 @@ rcu_read_lock() critical section. Reference counts may be used in conjunction with RCU to maintain longer-term references to data structures. + Note that anything that disables bottom halves, preemption, + or interrupts also enters an RCU read-side critical section. + Acquiring a spinlock also enters an RCU read-side critical + sections, even for spinlocks that do not disable preemption, + as is the case in kernels built with CONFIG_PREEMPT_RT=y. + Sleeplocks do *not* enter RCU read-side critical sections. + rcu_read_unlock() ^^^^^^^^^^^^^^^^^ void rcu_read_unlock(void); This temporal primitives is used by a reader to inform the reclaimer that the reader is exiting an RCU read-side critical - section. Note that RCU read-side critical sections may be nested - and/or overlapping. + section. Anything that enables bottom halves, preemption, + or interrupts also exits an RCU read-side critical section. + Releasing a spinlock also exits an RCU read-side critical section. + + Note that RCU read-side critical sections may be nested and/or + overlapping. synchronize_rcu() ^^^^^^^^^^^^^^^^^ From 3cf501612108b8a7a4cebf8a6ac1d7575080c88f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 16:23:56 -0800 Subject: [PATCH 17/34] doc: Make checklist.rst note that spinlocks are implied RCU readers In kernels built with CONFIG_PREEMPT_RT=n, spinlock critical sections are RCU readers because they disable preemption. However, they are also RCU readers in CONFIG_PREEMPT_RT=y because in that case the locking primitives contain rcu_read_lock() and rcu_read_unlock(). Therefore, upgrade checklist.rst to document this non-obvious case. While in the area, fix a typo by changing "read-side critical" to "read-side critical section". Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/checklist.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 2d42998a89a6..98a622f77248 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -68,7 +68,8 @@ over a rather long period of time, but improvements are always welcome! rcu_read_lock_sched(), or by the appropriate update-side lock. Explicit disabling of preemption (preempt_disable(), for example) can serve as rcu_read_lock_sched(), but is less readable and - prevents lockdep from detecting locking issues. + prevents lockdep from detecting locking issues. Acquiring a + spinlock also enters an RCU read-side critical section. Please note that you *cannot* rely on code known to be built only in non-preemptible kernels. Such code can and will break, @@ -444,7 +445,7 @@ over a rather long period of time, but improvements are always welcome! real-time workloads than is synchronize_rcu_expedited(). It is also permissible to sleep in RCU Tasks Trace read-side - critical, which are delimited by rcu_read_lock_trace() and + critical section, which are delimited by rcu_read_lock_trace() and rcu_read_unlock_trace(). However, this is a specialized flavor of RCU, and you should not use it without first checking with its current users. 
In most cases, you should instead use SRCU. From 739337d482f12b9eff062586ba64f008fcc6efba Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 24 Nov 2023 16:29:01 -0800 Subject: [PATCH 18/34] doc: Add CONFIG_RCU_STRICT_GRACE_PERIOD to checklist.rst This commit adds CONFIG_RCU_STRICT_GRACE_PERIOD to the list of debugging Kconfig options in checklist.rst. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/checklist.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 98a622f77248..addd5c1547a4 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -491,6 +491,12 @@ over a rather long period of time, but improvements are always welcome! since the last time that you passed that same object to call_rcu() (or friends). + CONFIG_RCU_STRICT_GRACE_PERIOD: + combine with KASAN to check for pointers leaked out + of RCU read-side critical sections. This Kconfig + option is tough on both performance and scalability, + and so is limited to four-CPU systems. + __rcu sparse checks: tag the pointer to the RCU-protected data structure with __rcu, and sparse will warn you if you access that From 600716592a3a6de8bfcf3a0625d75cda8dce3ced Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 26 Nov 2023 11:06:10 -0800 Subject: [PATCH 19/34] doc: Add EARLY flag to early-parsed kernel boot parameters Kernel boot parameters declared with early_param() are parsed before embedded parameters are extracted from initrd, and early_param() parameters are not helpful when embedded in initrd. Therefore, mark early_param() kernel boot parameters with "EARLY" in kernel-parameters.txt. The following early_param() calls declare kernel boot parameters that are undocumented: early_param("atmel.pm_modes", at91_pm_modes_select); early_param("mem_fclk_21285", early_fclk); early_param("ecc", early_ecc); early_param("cachepolicy", early_cachepolicy); early_param("nodebugmon", early_debug_disable); early_param("kfence.sample_interval", parse_kfence_early_init); early_param("additional_cpus", setup_additional_cpus); early_param("stram_pool", atari_stram_setup); early_param("disable_octeon_edac", disable_octeon_edac); early_param("rd_start", rd_start_early); early_param("rd_size", rd_size_early); early_param("coherentio", setcoherentio); early_param("nocoherentio", setnocoherentio); early_param("fadump", early_fadump_param); early_param("fadump_reserve_mem", early_fadump_reserve_mem); early_param("no_stf_barrier", handle_no_stf_barrier); early_param("no_rfi_flush", handle_no_rfi_flush); early_param("smt-enabled", early_smt_enabled); early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); early_param("ps3fb", early_parse_ps3fb); early_param("ps3flash", early_parse_ps3flash); early_param("novx", disable_vector_extension); early_param("nobp", nobp_setup_early); early_param("nospec", nospec_setup_early); early_param("possible_cpus", _setup_possible_cpus); early_param("stp", early_parse_stp); early_param("nopfault", nopfault); early_param("nmi_mode", nmi_mode_setup); early_param("sh_mv", early_parse_mv); early_param("pmb", early_pmb); early_param("hvirq", early_hvirq_major); early_param("cfi", cfi_parse_cmdline); early_param("disableapic", setup_disableapic); early_param("noapictimer", parse_disable_apic_timer); early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); early_param("uv_memblksize", parse_mem_block_size); early_param("retbleed", retbleed_parse_cmdline); 
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); early_param("update_mptable", update_mptable_setup); early_param("alloc_mptable", parse_alloc_mptable_opt); early_param("possible_cpus", _setup_possible_cpus); early_param("lsmsi", early_parse_ls_scfg_msi); early_param("nokgdbroundup", opt_nokgdbroundup); early_param("kgdbcon", opt_kgdb_con); early_param("kasan", early_kasan_flag); early_param("kasan.mode", early_kasan_mode); early_param("kasan.vmalloc", early_kasan_flag_vmalloc); early_param("kasan.page_alloc.sample", early_kasan_flag_page_alloc_sample); early_param("kasan.page_alloc.sample.order", early_kasan_flag_page_alloc_sample_order); early_param("kasan.fault", early_kasan_fault); early_param("kasan.stacktrace", early_kasan_flag_stacktrace); early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); early_param("accept_memory", accept_memory_parse); early_param("page_table_check", early_page_table_check_param); sh_early_platform_init("earlytimer", &sh_cmt_device_driver); early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES); These are not necessarily bugs, given that some kernel boot parameters are intended for deep debugging rather than general use. This work does not cover all of the kernel boot parameters declared using cmdline_find_option() and cmdline_find_option_bool(). If these are in fact guaranteed to be early (which appears to be the case), they can be added in a later version of this patch. Signed-off-by: Paul E. McKenney Cc: Jonathan Corbet Cc: Masami Hiramatsu Cc: Petr Malat Cc: Randy Dunlap Cc: Cc: Signed-off-by: Boqun Feng --- .../admin-guide/kernel-parameters.rst | 1 + .../admin-guide/kernel-parameters.txt | 484 +++++++++--------- 2 files changed, 250 insertions(+), 235 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 4410384596a9..e8bdf5e86a9b 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -108,6 +108,7 @@ is applicable:: CMA Contiguous Memory Area support is enabled. DRM Direct Rendering Management support is enabled. DYNAMIC_DEBUG Build in debug messages and enable them at runtime + EARLY Parameter processed too early to be embedded in initrd. EDD BIOS Enhanced Disk Drive Services (EDD) is enabled EFI EFI Partitioning (GPT) is enabled EVM Extended Verification Module diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d0..4839f2919fdf 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -9,7 +9,7 @@ accept_memory=eager can be used to accept all memory at once during boot. - acpi= [HW,ACPI,X86,ARM64,RISCV64] + acpi= [HW,ACPI,X86,ARM64,RISCV64,EARLY] Advanced Configuration and Power Interface Format: { force | on | off | strict | noirq | rsdt | copy_dsdt } @@ -26,7 +26,7 @@ See also Documentation/power/runtime_pm.rst, pci=noacpi - acpi_apic_instance= [ACPI, IOAPIC] + acpi_apic_instance= [ACPI,IOAPIC,EARLY] Format: 2: use 2nd APIC table, if available 1,0: use 1st APIC table @@ -41,7 +41,7 @@ If set to native, use the device's native backlight mode. If set to none, disable the ACPI backlight interface. - acpi_force_32bit_fadt_addr + acpi_force_32bit_fadt_addr [ACPI,EARLY] force FADT to use 32 bit addresses rather than the 64 bit X_* addresses. 
Some firmware have broken 64 bit addresses for force ACPI ignore these and use @@ -97,7 +97,7 @@ no: ACPI OperationRegions are not marked as reserved, no further checks are performed. - acpi_force_table_verification [HW,ACPI] + acpi_force_table_verification [HW,ACPI,EARLY] Enable table checksum verification during early stage. By default, this is disabled due to x86 early mapping size limitation. @@ -137,7 +137,7 @@ acpi_no_memhotplug [ACPI] Disable memory hotplug. Useful for kdump kernels. - acpi_no_static_ssdt [HW,ACPI] + acpi_no_static_ssdt [HW,ACPI,EARLY] Disable installation of static SSDTs at early boot time By default, SSDTs contained in the RSDT/XSDT will be installed automatically and they will appear under @@ -151,7 +151,7 @@ Ignore the ACPI-based watchdog interface (WDAT) and let a native driver control the watchdog device instead. - acpi_rsdp= [ACPI,EFI,KEXEC] + acpi_rsdp= [ACPI,EFI,KEXEC,EARLY] Pass the RSDP address to the kernel, mostly used on machines running EFI runtime service to boot the second kernel for kdump. @@ -228,10 +228,10 @@ to assume that this machine's pmtimer latches its value and always returns good values. - acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode + acpi_sci= [HW,ACPI,EARLY] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } - acpi_skip_timer_override [HW,ACPI] + acpi_skip_timer_override [HW,ACPI,EARLY] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. @@ -266,11 +266,11 @@ behave incorrectly in some ways with respect to system suspend and resume to be ignored (use wisely). - acpi_use_timer_override [HW,ACPI] + acpi_use_timer_override [HW,ACPI,EARLY] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET - add_efi_memmap [EFI; X86] Include EFI memory map in + add_efi_memmap [EFI,X86,EARLY] Include EFI memory map in kernel's map of available physical RAM. agp= [AGP] @@ -307,7 +307,7 @@ do not want to use tracing_snapshot_alloc() as it needs to be done where GFP_KERNEL allocations are allowed. - allow_mismatched_32bit_el0 [ARM64] + allow_mismatched_32bit_el0 [ARM64,EARLY] Allow execve() of 32-bit applications and setting of the PER_LINUX32 personality on systems where only a strict subset of the CPUs support 32-bit EL0. When this @@ -351,7 +351,7 @@ This mode requires kvm-amd.avic=1. (Default when IOMMU HW support is present.) - amd_pstate= [X86] + amd_pstate= [X86,EARLY] disable Do not enable amd_pstate as the default scaling driver for the supported processors @@ -391,7 +391,7 @@ not play well with APC CPU idle - disable it if you have APC and your system crashes randomly. - apic= [APIC,X86] Advanced Programmable Interrupt Controller + apic= [APIC,X86,EARLY] Advanced Programmable Interrupt Controller Change the output verbosity while booting Format: { quiet (default) | verbose | debug } Change the amount of debugging information output @@ -401,7 +401,7 @@ Format: apic=driver_name Examples: apic=bigsmp - apic_extnmi= [APIC,X86] External NMI delivery setting + apic_extnmi= [APIC,X86,EARLY] External NMI delivery setting Format: { bsp (default) | all | none } bsp: External NMI is delivered only to CPU 0 all: External NMIs are broadcast to all CPUs as a @@ -508,21 +508,22 @@ bert_disable [ACPI] Disable BERT OS support on buggy BIOSes. - bgrt_disable [ACPI][X86] + bgrt_disable [ACPI,X86,EARLY] Disable BGRT to avoid flickering OEM logo. 
blkdevparts= Manual partition parsing of block device(s) for embedded devices based on command line input. See Documentation/block/cmdline-partition.rst - boot_delay= Milliseconds to delay each printk during boot. + boot_delay= [KNL,EARLY] + Milliseconds to delay each printk during boot. Only works if CONFIG_BOOT_PRINTK_DELAY is enabled, and you may also have to specify "lpj=". Boot_delay values larger than 10 seconds (10000) are assumed erroneous and ignored. Format: integer - bootconfig [KNL] + bootconfig [KNL,EARLY] Extended command line options can be added to an initrd and this will cause the kernel to look for it. @@ -557,7 +558,7 @@ trust validation. format: { id: | builtin } - cca= [MIPS] Override the kernel pages' cache coherency + cca= [MIPS,EARLY] Override the kernel pages' cache coherency algorithm. Accepted values range from 0 to 7 inclusive. See arch/mips/include/asm/pgtable-bits.h for platform specific values (SB1, Loongson3 and @@ -672,7 +673,7 @@ [X86-64] hpet,tsc clocksource.arm_arch_timer.evtstrm= - [ARM,ARM64] + [ARM,ARM64,EARLY] Format: Enable/disable the eventstream feature of the ARM architected timer so that code using WFE-based polling @@ -702,7 +703,7 @@ 10 seconds when built into the kernel. cma=nn[MG]@[start[MG][-end[MG]]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel global memory area for contiguous memory allocations and optionally the placement constraint by the physical address range of @@ -711,7 +712,7 @@ kernel/dma/contiguous.c cma_pernuma=nn[MG] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel per-numa memory area for contiguous memory allocations. A value of 0 disables per-numa CMA altogether. And If this option is not @@ -722,7 +723,7 @@ they will fallback to the global default memory area. numa_cma=:nn[MG][,:nn[MG]] - [KNL,CMA] + [KNL,CMA,EARLY] Sets the size of kernel numa memory area for contiguous memory allocations. It will reserve CMA area for the specified node. @@ -739,7 +740,7 @@ a hypervisor. Default: yes - coherent_pool=nn[KMG] [ARM,KNL] + coherent_pool=nn[KMG] [ARM,KNL,EARLY] Sets the size of memory pool for coherent, atomic dma allocations, by default set to 256K. @@ -757,7 +758,7 @@ condev= [HW,S390] console device conmode= - con3215_drop= [S390] 3215 console drop mode. + con3215_drop= [S390,EARLY] 3215 console drop mode. Format: y|n|Y|N|1|0 When set to true, drop data on the 3215 console when the console buffer is full. In this case the @@ -863,7 +864,7 @@ kernel before the cpufreq driver probes. cpu_init_udelay=N - [X86] Delay for N microsec between assert and de-assert + [X86,EARLY] Delay for N microsec between assert and de-assert of APIC INIT to start processors. This delay occurs on every CPU online, such as boot, and resume from suspend. Default: 10000 @@ -883,7 +884,7 @@ kernel more unstable. crashkernel=size[KMG][@offset[KMG]] - [KNL] Using kexec, Linux can switch to a 'crash kernel' + [KNL,EARLY] Using kexec, Linux can switch to a 'crash kernel' upon panic. This parameter reserves the physical memory region [offset, offset + size] for that kernel image. If '@offset' is omitted, then a suitable offset @@ -954,10 +955,10 @@ Format: , See also Documentation/input/devices/joystick-parport.rst - debug [KNL] Enable kernel debugging (events log level). + debug [KNL,EARLY] Enable kernel debugging (events log level). debug_boot_weak_hash - [KNL] Enable printing [hashed] pointers early in the + [KNL,EARLY] Enable printing [hashed] pointers early in the boot sequence. 
If enabled, we use a weak hash instead of siphash to hash pointers. Use this option if you are seeing instances of '(___ptrval___)') and need to see a @@ -974,10 +975,10 @@ will print _a_lot_ more information - normally only useful to lockdep developers. - debug_objects [KNL] Enable object debugging + debug_objects [KNL,EARLY] Enable object debugging debug_guardpage_minorder= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter allows control of the order of pages that will be intentionally kept free (and hence protected) by the buddy allocator. Bigger value increase the probability @@ -996,7 +997,7 @@ help tracking down these problems. debug_pagealloc= - [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this parameter + [KNL,EARLY] When CONFIG_DEBUG_PAGEALLOC is set, this parameter enables the feature at boot time. By default, it is disabled and the system will work mostly the same as a kernel built without CONFIG_DEBUG_PAGEALLOC. @@ -1004,8 +1005,8 @@ useful to also enable the page_owner functionality. on: enable the feature - debugfs= [KNL] This parameter enables what is exposed to userspace - and debugfs internal clients. + debugfs= [KNL,EARLY] This parameter enables what is exposed to + userspace and debugfs internal clients. Format: { on, no-mount, off } on: All functions are enabled. no-mount: @@ -1084,7 +1085,7 @@ dhash_entries= [KNL] Set number of hash buckets for dentry cache. - disable_1tb_segments [PPC] + disable_1tb_segments [PPC,EARLY] Disables the use of 1TB hash page table segments. This causes the kernel to fall back to 256MB segments which can be useful when debugging issues that require an SLB @@ -1093,7 +1094,7 @@ disable= [IPV6] See Documentation/networking/ipv6.rst. - disable_radix [PPC] + disable_radix [PPC,EARLY] Disable RADIX MMU mode on POWER9 disable_tlbie [PPC] @@ -1109,25 +1110,25 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_ddw [PPC/PSERIES] + disable_ddw [PPC/PSERIES,EARLY] Disable Dynamic DMA Window support. Use this to workaround buggy firmware. disable_ipv6= [IPV6] See Documentation/networking/ipv6.rst. - disable_mtrr_cleanup [X86] + disable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter disables that. - disable_mtrr_trim [X86, Intel and AMD only] + disable_mtrr_trim [X86, Intel and AMD only,EARLY] By default the kernel will trim any uncacheable memory out of your available memory pool based on MTRR settings. This parameter disables that behavior, possibly causing your machine to run very slowly. - disable_timer_pin_1 [X86] + disable_timer_pin_1 [X86,EARLY] Disable PIN 1 of APIC timer Can be useful to work around chipset bugs. @@ -1177,7 +1178,7 @@ dscc4.setup= [NET] - dt_cpu_ftrs= [PPC] + dt_cpu_ftrs= [PPC,EARLY] Format: {"off" | "known"} Control how the dt_cpu_ftrs device-tree binding is used for CPU feature discovery and setup (if it @@ -1197,12 +1198,12 @@ Documentation/admin-guide/dynamic-debug-howto.rst for details. - early_ioremap_debug [KNL] + early_ioremap_debug [KNL,EARLY] Enable debug messages in early_ioremap support. This is useful for tracking down temporary early mappings which are not unmapped. - earlycon= [KNL] Output early console device and options. + earlycon= [KNL,EARLY] Output early console device and options. 
When used with no options, the early console is determined by stdout-path property in device tree's @@ -1338,7 +1339,7 @@ address must be provided, and the serial port must already be setup and configured. - earlyprintk= [X86,SH,ARM,M68k,S390] + earlyprintk= [X86,SH,ARM,M68k,S390,UM,EARLY] earlyprintk=vga earlyprintk=sclp earlyprintk=xen @@ -1396,7 +1397,7 @@ edd= [EDD] Format: {"off" | "on" | "skip[mbr]"} - efi= [EFI] + efi= [EFI,EARLY] Format: { "debug", "disable_early_pci_dma", "nochunk", "noruntime", "nosoftreserve", "novamap", "no_disable_early_pci_dma" } @@ -1417,13 +1418,13 @@ no_disable_early_pci_dma: Leave the busmaster bit set on all PCI bridges while in the EFI boot stub - efi_no_storage_paranoia [EFI; X86] + efi_no_storage_paranoia [EFI,X86,EARLY] Using this parameter you can use more than 50% of your efi variable storage. Use this parameter only if you are really sure that your UEFI does sane gc and fulfills the spec otherwise your board may brick. - efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI; X86] + efi_fake_mem= nn[KMG]@ss[KMG]:aa[,nn[KMG]@ss[KMG]:aa,..] [EFI,X86,EARLY] Add arbitrary attribute to specific memory range by updating original EFI memory map. Region of memory which aa attribute is added to is @@ -1454,7 +1455,7 @@ eisa_irq_edge= [PARISC,HW] See header of drivers/parisc/eisa.c. - ekgdboc= [X86,KGDB] Allow early kernel console debugging + ekgdboc= [X86,KGDB,EARLY] Allow early kernel console debugging Format: ekgdboc=kbd This is designed to be used in conjunction with @@ -1469,13 +1470,13 @@ See comment before function elanfreq_setup() in arch/x86/kernel/cpu/cpufreq/elanfreq.c. - elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390] + elfcorehdr=[size[KMG]@]offset[KMG] [PPC,SH,X86,S390,EARLY] Specifies physical address of start of kernel core image elf header and optionally the size. Generally kexec loader will pass this option to capture kernel. See Documentation/admin-guide/kdump/kdump.rst for details. - enable_mtrr_cleanup [X86] + enable_mtrr_cleanup [X86,EARLY] The kernel tries to adjust MTRR layout from continuous to discrete, to make X server driver able to add WB entry later. This parameter enables that. @@ -1508,7 +1509,7 @@ Permit 'security.evm' to be updated regardless of current integrity status. - early_page_ext [KNL] Enforces page_ext initialization to earlier + early_page_ext [KNL,EARLY] Enforces page_ext initialization to earlier stages so cover more early boot allocations. Please note that as side effect some optimizations might be disabled to achieve that (e.g. parallelized @@ -1600,7 +1601,7 @@ can be changed at run time by the max_graph_depth file in the tracefs tracing directory. default: 0 (no limit) - fw_devlink= [KNL] Create device links between consumer and supplier + fw_devlink= [KNL,EARLY] Create device links between consumer and supplier devices by scanning the firmware to infer the consumer/supplier relationships. This feature is especially useful when drivers are loaded as modules as @@ -1619,12 +1620,12 @@ rpm -- Like "on", but also use to order runtime PM. fw_devlink.strict= - [KNL] Treat all inferred dependencies as mandatory + [KNL,EARLY] Treat all inferred dependencies as mandatory dependencies. This only applies for fw_devlink=on|rpm. Format: fw_devlink.sync_state = - [KNL] When all devices that could probe have finished + [KNL,EARLY] When all devices that could probe have finished probing, this parameter controls what to do with devices that haven't yet received their sync_state() calls. 
@@ -1645,12 +1646,12 @@ gamma= [HW,DRM] - gart_fix_e820= [X86-64] disable the fix e820 for K8 GART + gart_fix_e820= [X86-64,EARLY] disable the fix e820 for K8 GART Format: off | on default: on gather_data_sampling= - [X86,INTEL] Control the Gather Data Sampling (GDS) + [X86,INTEL,EARLY] Control the Gather Data Sampling (GDS) mitigation. Gather Data Sampling is a hardware vulnerability which @@ -1748,7 +1749,7 @@ (that will set all pages holding image data during restoration read-only). - highmem=nn[KMG] [KNL,BOOT] forces the highmem zone to have an exact + highmem=nn[KMG] [KNL,BOOT,EARLY] forces the highmem zone to have an exact size of . This works even on boxes that have no highmem otherwise. This also works to reduce highmem size on bigger boxes. @@ -1759,7 +1760,7 @@ hlt [BUGS=ARM,SH] - hostname= [KNL] Set the hostname (aka UTS nodename). + hostname= [KNL,EARLY] Set the hostname (aka UTS nodename). Format: This allows setting the system's hostname during early startup. This sets the name returned by gethostname. @@ -1804,7 +1805,7 @@ Documentation/admin-guide/mm/hugetlbpage.rst. Format: size[KMG] - hugetlb_cma= [HW,CMA] The size of a CMA area used for allocation + hugetlb_cma= [HW,CMA,EARLY] The size of a CMA area used for allocation of gigantic hugepages. Or using node format, the size of a CMA area per node can be specified. Format: nn[KMGTPE] or (node format) @@ -1850,9 +1851,10 @@ If specified, z/VM IUCV HVC accepts connections from listed z/VM user IDs only. - hv_nopvspin [X86,HYPER_V] Disables the paravirt spinlock optimizations - which allow the hypervisor to 'idle' the - guest on lock contention. + hv_nopvspin [X86,HYPER_V,EARLY] + Disables the paravirt spinlock optimizations + which allow the hypervisor to 'idle' the guest + on lock contention. i2c_bus= [HW] Override the default board specific I2C bus speed or register an additional I2C bus that is not @@ -1917,7 +1919,7 @@ Format: [,[,[,]]] - idle= [X86] + idle= [X86,EARLY] Format: idle=poll, idle=halt, idle=nomwait Poll forces a polling idle loop that can slightly improve the performance of waking up a idle CPU, but @@ -1973,7 +1975,7 @@ mode generally follows that for the NaN encoding, except where unsupported by hardware. - ignore_loglevel [KNL] + ignore_loglevel [KNL,EARLY] Ignore loglevel setting - this will print /all/ kernel messages to the console. Useful for debugging. We also add it as printk module parameter, so users @@ -2091,21 +2093,21 @@ unpacking being completed before device_ and late_ initcalls. - initrd= [BOOT] Specify the location of the initial ramdisk + initrd= [BOOT,EARLY] Specify the location of the initial ramdisk - initrdmem= [KNL] Specify a physical address and size from which to + initrdmem= [KNL,EARLY] Specify a physical address and size from which to load the initrd. If an initrd is compiled in or specified in the bootparams, it takes priority over this setting. Format: ss[KMG],nn[KMG] Default is 0, 0 - init_on_alloc= [MM] Fill newly allocated pages and heap objects with + init_on_alloc= [MM,EARLY] Fill newly allocated pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_ALLOC_DEFAULT_ON. - init_on_free= [MM] Fill freed pages and heap objects with zeroes. + init_on_free= [MM,EARLY] Fill freed pages and heap objects with zeroes. Format: 0 | 1 Default set by CONFIG_INIT_ON_FREE_DEFAULT_ON. @@ -2161,7 +2163,7 @@ 0 disables intel_idle and fall back on acpi_idle. 1 to 9 specify maximum depth of C-state. 
- intel_pstate= [X86] + intel_pstate= [X86,EARLY] disable Do not enable intel_pstate as the default scaling driver for the supported processors @@ -2205,7 +2207,7 @@ Allow per-logical-CPU P-State performance control limits using cpufreq sysfs interface - intremap= [X86-64, Intel-IOMMU] + intremap= [X86-64,Intel-IOMMU,EARLY] on enable Interrupt Remapping (default) off disable Interrupt Remapping nosid disable Source ID checking @@ -2217,7 +2219,7 @@ strict regions from userspace. relaxed - iommu= [X86] + iommu= [X86,EARLY] off force noforce @@ -2232,7 +2234,7 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. - iommu.forcedac= [ARM64, X86] Control IOVA allocation for PCI devices. + iommu.forcedac= [ARM64,X86,EARLY] Control IOVA allocation for PCI devices. Format: { "0" | "1" } 0 - Try to allocate a 32-bit DMA address first, before falling back to the full range if needed. @@ -2240,7 +2242,7 @@ forcing Dual Address Cycle for PCI cards supporting greater than 32-bit addressing. - iommu.strict= [ARM64, X86, S390] Configure TLB invalidation behaviour + iommu.strict= [ARM64,X86,S390,EARLY] Configure TLB invalidation behaviour Format: { "0" | "1" } 0 - Lazy mode. Request that DMA unmap operations use deferred @@ -2256,7 +2258,7 @@ legacy driver-specific options takes precedence. iommu.passthrough= - [ARM64, X86] Configure DMA to bypass the IOMMU by default. + [ARM64,X86,EARLY] Configure DMA to bypass the IOMMU by default. Format: { "0" | "1" } 0 - Use IOMMU translation for DMA. 1 - Bypass the IOMMU for DMA. @@ -2266,7 +2268,7 @@ See comment before marvel_specify_io7 in arch/alpha/kernel/core_marvel.c. - io_delay= [X86] I/O delay method + io_delay= [X86,EARLY] I/O delay method 0x80 Standard port 0x80 based delay 0xed @@ -2279,28 +2281,28 @@ ip= [IP_PNP] See Documentation/admin-guide/nfs/nfsroot.rst. - ipcmni_extend [KNL] Extend the maximum number of unique System V + ipcmni_extend [KNL,EARLY] Extend the maximum number of unique System V IPC identifiers from 32,768 to 16,777,216. irqaffinity= [SMP] Set the default irq affinity mask The argument is a cpu list, as described above. irqchip.gicv2_force_probe= - [ARM, ARM64] + [ARM,ARM64,EARLY] Format: Force the kernel to look for the second 4kB page of a GICv2 controller even if the memory range exposed by the device tree is too small. irqchip.gicv3_nolpi= - [ARM, ARM64] + [ARM,ARM64,EARLY] Force the kernel to ignore the availability of LPIs (and by consequence ITSs). Intended for system that use the kernel as a bootloader, and thus want to let secondary kernels in charge of setting up LPIs. - irqchip.gicv3_pseudo_nmi= [ARM64] + irqchip.gicv3_pseudo_nmi= [ARM64,EARLY] Enables support for pseudo-NMIs in the kernel. This requires the kernel to be built with CONFIG_ARM64_PSEUDO_NMI. @@ -2445,7 +2447,7 @@ parameter KASAN will print report only for the first invalid access. - keep_bootcon [KNL] + keep_bootcon [KNL,EARLY] Do not unregister boot console at start. This is only useful for debugging when something happens in the window between unregistering the boot console and initializing @@ -2453,7 +2455,7 @@ keepinitrd [HW,ARM] See retain_initrd. - kernelcore= [KNL,X86,IA-64,PPC] + kernelcore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% | "mirror" This parameter specifies the amount of memory usable by the kernel for non-movable allocations. The requested @@ -2478,7 +2480,7 @@ for Movable pages. "nn[KMGTPE]", "nn%", and "mirror" are exclusive, so you cannot specify multiple forms. 
- kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. + kgdbdbgp= [KGDB,HW,EARLY] kgdb over EHCI usb debug port. Format: [,poll interval] The controller # is the number of the ehci usb debug port as it is probed via PCI. The poll interval is @@ -2499,7 +2501,7 @@ kms, kbd format: kms,kbd kms, kbd and serial format: kms,kbd,[,baud] - kgdboc_earlycon= [KGDB,HW] + kgdboc_earlycon= [KGDB,HW,EARLY] If the boot console provides the ability to read characters and can work in polling mode, you can use this parameter to tell kgdb to use it as a backend @@ -2514,14 +2516,14 @@ blank and the first boot console that implements read() will be picked. - kgdbwait [KGDB] Stop kernel execution and enter the + kgdbwait [KGDB,EARLY] Stop kernel execution and enter the kernel debugger at the earliest opportunity. kmac= [MIPS] Korina ethernet MAC address. Configure the RouterBoard 532 series on-chip Ethernet adapter MAC address. - kmemleak= [KNL] Boot-time kmemleak enable/disable + kmemleak= [KNL,EARLY] Boot-time kmemleak enable/disable Valid arguments: on, off Default: on Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, @@ -2540,8 +2542,8 @@ See also Documentation/trace/kprobetrace.rst "Kernel Boot Parameter" section. - kpti= [ARM64] Control page table isolation of user - and kernel address spaces. + kpti= [ARM64,EARLY] Control page table isolation of + user and kernel address spaces. Default: enabled on cores which need mitigation. 0: force disabled 1: force enabled @@ -2618,7 +2620,8 @@ for NPT. kvm-arm.mode= - [KVM,ARM] Select one of KVM/arm64's modes of operation. + [KVM,ARM,EARLY] Select one of KVM/arm64's modes of + operation. none: Forcefully disable KVM. @@ -2638,22 +2641,22 @@ used with extreme caution. kvm-arm.vgic_v3_group0_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-0 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 system registers kvm-arm.vgic_v3_group1_trap= - [KVM,ARM] Trap guest accesses to GICv3 group-1 + [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-1 system registers kvm-arm.vgic_v3_common_trap= - [KVM,ARM] Trap guest accesses to GICv3 common + [KVM,ARM,EARLY] Trap guest accesses to GICv3 common system registers kvm-arm.vgic_v4_enable= - [KVM,ARM] Allow use of GICv4 for direct injection of - LPIs. + [KVM,ARM,EARLY] Allow use of GICv4 for direct + injection of LPIs. - kvm_cma_resv_ratio=n [PPC] + kvm_cma_resv_ratio=n [PPC,EARLY] Reserves given percentage from system memory area for contiguous memory allocation for KVM hash pagetable allocation. @@ -2706,7 +2709,7 @@ (enabled). Disable by KVM if hardware lacks support for it. - l1d_flush= [X86,INTEL] + l1d_flush= [X86,INTEL,EARLY] Control mitigation for L1D based snooping vulnerability. Certain CPUs are vulnerable to an exploit against CPU @@ -2723,7 +2726,7 @@ on - enable the interface for the mitigation - l1tf= [X86] Control mitigation of the L1TF vulnerability on + l1tf= [X86,EARLY] Control mitigation of the L1TF vulnerability on affected CPUs The kernel PTE inversion protection is unconditionally @@ -2792,7 +2795,7 @@ l3cr= [PPC] - lapic [X86-32,APIC] Enable the local APIC even if BIOS + lapic [X86-32,APIC,EARLY] Enable the local APIC even if BIOS disabled it. lapic= [X86,APIC] Do not use TSC deadline @@ -2800,7 +2803,7 @@ back to the programmable timer unit in the LAPIC. Format: notscdeadline - lapic_timer_c2_ok [X86,APIC] trust the local apic timer + lapic_timer_c2_ok [X86,APIC,EARLY] trust the local apic timer in C2 power state. 
libata.dma= [LIBATA] DMA control @@ -2924,7 +2927,7 @@ lockd.nlm_udpport=M [NFS] Assign UDP port. Format: - lockdown= [SECURITY] + lockdown= [SECURITY,EARLY] { integrity | confidentiality } Enable the kernel lockdown feature. If set to integrity, kernel features that allow userland to @@ -3031,7 +3034,8 @@ logibm.irq= [HW,MOUSE] Logitech Bus Mouse Driver Format: - loglevel= All Kernel Messages with a loglevel smaller than the + loglevel= [KNL,EARLY] + All Kernel Messages with a loglevel smaller than the console loglevel will be printed to the console. It can also be changed with klogd or other programs. The loglevels are defined as follows: @@ -3045,13 +3049,15 @@ 6 (KERN_INFO) informational 7 (KERN_DEBUG) debug-level messages - log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two and greater - than the minimal size. The minimal size is defined - by LOG_BUF_SHIFT kernel config parameter. There is - also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter - that allows to increase the default size depending on - the number of CPUs. See init/Kconfig for more details. + log_buf_len=n[KMG] [KNL,EARLY] + Sets the size of the printk ring buffer, in bytes. + n must be a power of two and greater than the + minimal size. The minimal size is defined by + LOG_BUF_SHIFT kernel config parameter. There + is also CONFIG_LOG_CPU_MAX_BUF_SHIFT config + parameter that allows to increase the default size + depending on the number of CPUs. See init/Kconfig + for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for @@ -3109,7 +3115,7 @@ max_addr=nn[KMG] [KNL,BOOT,IA-64] All physical memory greater than or equal to this physical address is ignored. - maxcpus= [SMP] Maximum number of processors that an SMP kernel + maxcpus= [SMP,EARLY] Maximum number of processors that an SMP kernel will bring up during bootup. maxcpus=n : n >= 0 limits the kernel to bring up 'n' processors. Surely after bootup you can bring up the other plugged cpu by executing @@ -3136,7 +3142,7 @@ Format: , Specifies range of consoles to be captured by the MDA. - mds= [X86,INTEL] + mds= [X86,INTEL,EARLY] Control mitigation for the Micro-architectural Data Sampling (MDS) vulnerability. @@ -3168,11 +3174,12 @@ For details see: Documentation/admin-guide/hw-vuln/mds.rst - mem=nn[KMG] [HEXAGON] Set the memory size. + mem=nn[KMG] [HEXAGON,EARLY] Set the memory size. Must be specified, otherwise memory size will be 0. - mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory - Amount of memory to be used in cases as follows: + mem=nn[KMG] [KNL,BOOT,EARLY] Force usage of a specific amount + of memory Amount of memory to be used in cases + as follows: 1 for test; 2 when the kernel is not able to see the whole system memory; @@ -3196,8 +3203,8 @@ if system memory of hypervisor is not sufficient. mem=nn[KMG]@ss[KMG] - [ARM,MIPS] - override the memory layout reported by - firmware. + [ARM,MIPS,EARLY] - override the memory layout + reported by firmware. Define a memory region of size nn[KMG] starting at ss[KMG]. Multiple different regions can be specified with @@ -3206,7 +3213,7 @@ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel memory. - memblock=debug [KNL] Enable memblock debug messages. + memblock=debug [KNL,EARLY] Enable memblock debug messages. memchunk=nn[KMG] [KNL,SH] Allow user to override the default size for @@ -3220,14 +3227,14 @@ option. See Documentation/admin-guide/mm/memory-hotplug.rst. 
- memmap=exactmap [KNL,X86] Enable setting of an exact + memmap=exactmap [KNL,X86,EARLY] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on BIOS output or other requirements. See the memmap=nn@ss option description. memmap=nn[KMG]@ss[KMG] - [KNL, X86, MIPS, XTENSA] Force usage of a specific region of memory. + [KNL, X86,MIPS,XTENSA,EARLY] Force usage of a specific region of memory. Region of memory to be used is from ss to ss+nn. If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG], which limits max address to nn[KMG]. @@ -3237,11 +3244,11 @@ memmap=100M@2G,100M#3G,1G!1024G memmap=nn[KMG]#ss[KMG] - [KNL,ACPI] Mark specific memory as ACPI data. + [KNL,ACPI,EARLY] Mark specific memory as ACPI data. Region of memory to be marked is from ss to ss+nn. memmap=nn[KMG]$ss[KMG] - [KNL,ACPI] Mark specific memory as reserved. + [KNL,ACPI,EARLY] Mark specific memory as reserved. Region of memory to be reserved is from ss to ss+nn. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 @@ -3251,14 +3258,14 @@ like Grub2, otherwise '$' and the following number will be eaten. - memmap=nn[KMG]!ss[KMG] + memmap=nn[KMG]!ss[KMG,EARLY] [KNL,X86] Mark specific memory as protected. Region of memory to be used, from ss to ss+nn. The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. memmap=%-+ - [KNL,ACPI] Convert memory within the specified region + [KNL,ACPI,EARLY] Convert memory within the specified region from to . If "-" is left out, the whole region will be marked as , even if previously unavailable. If "+" is left @@ -3266,7 +3273,7 @@ specified as e820 types, e.g., 1 = RAM, 2 = reserved, 3 = ACPI, 12 = PRAM. - memory_corruption_check=0/1 [X86] + memory_corruption_check=0/1 [X86,EARLY] Some BIOSes seem to corrupt the first 64k of memory when doing things like suspend/resume. Setting this option will scan the memory @@ -3278,13 +3285,13 @@ affects the same memory, you can use memmap= to prevent the kernel from using that memory. - memory_corruption_check_size=size [X86] + memory_corruption_check_size=size [X86,EARLY] By default it checks for corruption in the low 64k, making this memory unavailable for normal use. Use this parameter to scan for corruption in more or less memory. - memory_corruption_check_period=seconds [X86] + memory_corruption_check_period=seconds [X86,EARLY] By default it checks for corruption every 60 seconds. Use this parameter to check at some other rate. 0 disables periodic checking. @@ -3308,7 +3315,7 @@ Note that even when enabled, there are a few cases where the feature is not effective. - memtest= [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest + memtest= [KNL,X86,ARM,M68K,PPC,RISCV,EARLY] Enable memtest Format: default : 0 Specifies the number of memtest passes to be @@ -3376,7 +3383,7 @@ https://repo.or.cz/w/linux-2.6/mini2440.git mitigations= - [X86,PPC,S390,ARM64] Control optional mitigations for + [X86,PPC,S390,ARM64,EARLY] Control optional mitigations for CPU vulnerabilities. This is a set of curated, arch-independent options, each of which is an aggregation of existing arch-specific options. @@ -3429,7 +3436,7 @@ retbleed=auto,nosmt [X86] mminit_loglevel= - [KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this + [KNL,EARLY] When CONFIG_DEBUG_MEMORY_INIT is set, this parameter allows control of the logging verbosity for the additional memory initialisation checks. 
A value of 0 disables mminit logging and a level of 4 will @@ -3437,7 +3444,7 @@ so loglevel=8 may also need to be specified. mmio_stale_data= - [X86,INTEL] Control mitigation for the Processor + [X86,INTEL,EARLY] Control mitigation for the Processor MMIO Stale Data vulnerabilities. Processor MMIO Stale Data is a class of @@ -3512,7 +3519,7 @@ mousedev.yres= [MOUSE] Vertical screen resolution, used for devices reporting absolute coordinates, such as tablets - movablecore= [KNL,X86,IA-64,PPC] + movablecore= [KNL,X86,IA-64,PPC,EARLY] Format: nn[KMGTPE] | nn% This parameter is the complement to kernelcore=, it specifies the amount of memory used for migratable @@ -3523,7 +3530,7 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to make hotplugable memory + movable_node [KNL,EARLY] Boot-time switch to make hotplugable memory NUMA nodes to be movable. This means that the memory of such nodes will be usable only for movable allocations which rules out almost all kernel @@ -3547,21 +3554,21 @@ [HW] Make the MicroTouch USB driver use raw coordinates ('y', default) or cooked coordinates ('n') - mtrr=debug [X86] + mtrr=debug [X86,EARLY] Enable printing debug information related to MTRR registers at boot time. - mtrr_chunk_size=nn[KMG] [X86] + mtrr_chunk_size=nn[KMG,X86,EARLY] used for mtrr cleanup. It is largest continuous chunk that could hold holes aka. UC entries. - mtrr_gran_size=nn[KMG] [X86] + mtrr_gran_size=nn[KMG,X86,EARLY] Used for mtrr cleanup. It is granularity of mtrr block. Default is 1. Large value could prevent small alignment from using up MTRRs. - mtrr_spare_reg_nr=n [X86] + mtrr_spare_reg_nr=n [X86,EARLY] Format: Range: 0,7 : spare reg number Default : 1 @@ -3747,10 +3754,10 @@ emulation library even if a 387 maths coprocessor is present. - no4lvl [RISCV] Disable 4-level and 5-level paging modes. Forces - kernel to use 3-level paging instead. + no4lvl [RISCV,EARLY] Disable 4-level and 5-level paging modes. + Forces kernel to use 3-level paging instead. - no5lvl [X86-64,RISCV] Disable 5-level paging mode. Forces + no5lvl [X86-64,RISCV,EARLY] Disable 5-level paging mode. Forces kernel to use 4-level paging instead. noaliencache [MM, NUMA, SLAB] Disables the allocation of alien @@ -3759,15 +3766,15 @@ noalign [KNL,ARM] - noaltinstr [S390] Disables alternative instructions patching - (CPU alternatives feature). + noaltinstr [S390,EARLY] Disables alternative instructions + patching (CPU alternatives feature). - noapic [SMP,APIC] Tells the kernel to not make use of any + noapic [SMP,APIC,EARLY] Tells the kernel to not make use of any IOAPICs that may be present in the system. noautogroup Disable scheduler automatic task group creation. - nocache [ARM] + nocache [ARM,EARLY] no_console_suspend [HW] Never suspend the console @@ -3785,13 +3792,13 @@ turn on/off it dynamically. no_debug_objects - [KNL] Disable object debugging + [KNL,EARLY] Disable object debugging nodsp [SH] Disable hardware DSP at boot time. - noefi Disable EFI runtime services support. + noefi [EFI,EARLY] Disable EFI runtime services support. - no_entry_flush [PPC] Don't flush the L1-D cache when entering the kernel. + no_entry_flush [PPC,EARLY] Don't flush the L1-D cache when entering the kernel. noexec [IA-64] @@ -3822,6 +3829,7 @@ real-time systems. no_hash_pointers + [KNL,EARLY] Force pointers printed to the console or buffers to be unhashed. By default, when a pointer is printed via %p format string, that pointer is "hashed", i.e. 
obscured @@ -3846,9 +3854,9 @@ the impact of the sleep instructions. This is also useful when using JTAG debugger. - nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings. + nohugeiomap [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge I/O mappings. - nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings. + nohugevmalloc [KNL,X86,PPC,ARM64,EARLY] Disable kernel huge vmalloc mappings. nohz= [KNL] Boottime enable/disable dynamic ticks Valid arguments: on, off @@ -3870,13 +3878,13 @@ noinitrd [RAM] Tells the kernel not to load any configured initial RAM disk. - nointremap [X86-64, Intel-IOMMU] Do not enable interrupt + nointremap [X86-64,Intel-IOMMU,EARLY] Do not enable interrupt remapping. [Deprecated - use intremap=off] nointroute [IA-64] - noinvpcid [X86] Disable the INVPCID cpu feature. + noinvpcid [X86,EARLY] Disable the INVPCID cpu feature. noiotrap [SH] Disables trapped I/O port accesses. @@ -3887,19 +3895,19 @@ nojitter [IA-64] Disables jitter checking for ITC timers. - nokaslr [KNL] + nokaslr [KNL,EARLY] When CONFIG_RANDOMIZE_BASE is set, this disables kernel and module base offset ASLR (Address Space Layout Randomization). - no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page + no-kvmapf [X86,KVM,EARLY] Disable paravirtualized asynchronous page fault handling. - no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver + no-kvmclock [X86,KVM,EARLY] Disable paravirtualized KVM clock driver - nolapic [X86-32,APIC] Do not enable or use the local APIC. + nolapic [X86-32,APIC,EARLY] Do not enable or use the local APIC. - nolapic_timer [X86-32,APIC] Do not use the local APIC timer. + nolapic_timer [X86-32,APIC,EARLY] Do not use the local APIC timer. nomca [IA-64] Disable machine check abort handling @@ -3924,23 +3932,23 @@ shutdown the other cpus. Instead use the REBOOT_VECTOR irq. - nopat [X86] Disable PAT (page attribute table extension of + nopat [X86,EARLY] Disable PAT (page attribute table extension of pagetables) support. - nopcid [X86-64] Disable the PCID cpu feature. + nopcid [X86-64,EARLY] Disable the PCID cpu feature. nopku [X86] Disable Memory Protection Keys CPU feature found in some Intel CPUs. - nopti [X86-64] + nopti [X86-64,EARLY] Equivalent to pti=off - nopv= [X86,XEN,KVM,HYPER_V,VMWARE] + nopv= [X86,XEN,KVM,HYPER_V,VMWARE,EARLY] Disables the PV optimizations forcing the guest to run as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. - nopvspin [X86,XEN,KVM] + nopvspin [X86,XEN,KVM,EARLY] Disables the qspinlock slow path using PV optimizations which allow the hypervisor to 'idle' the guest on lock contention. @@ -3960,20 +3968,20 @@ This is required for the Braillex ib80-piezo Braille reader made by F.H. Papenmeier (Germany). - nosgx [X86-64,SGX] Disables Intel SGX kernel support. + nosgx [X86-64,SGX,EARLY] Disables Intel SGX kernel support. - nosmap [PPC] + nosmap [PPC,EARLY] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [PPC64s] + nosmep [PPC64s,EARLY] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. - nosmp [SMP] Tells an SMP kernel to act as a UP kernel, + nosmp [SMP,EARLY] Tells an SMP kernel to act as a UP kernel, and disable the IO APIC. legacy for "maxcpus=0". - nosmt [KNL,MIPS,PPC,S390] Disable symmetric multithreading (SMT). + nosmt [KNL,MIPS,PPC,S390,EARLY] Disable symmetric multithreading (SMT). Equivalent to smt=1. [KNL,X86,PPC] Disable symmetric multithreading (SMT). 
@@ -3983,22 +3991,23 @@ nosoftlockup [KNL] Disable the soft-lockup detector. nospec_store_bypass_disable - [HW] Disable all mitigations for the Speculative Store Bypass vulnerability + [HW,EARLY] Disable all mitigations for the Speculative + Store Bypass vulnerability - nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch + nospectre_bhb [ARM64,EARLY] Disable all mitigations for Spectre-BHB (branch history injection) vulnerability. System may allow data leaks with this option. - nospectre_v1 [X86,PPC] Disable mitigations for Spectre Variant 1 + nospectre_v1 [X86,PPC,EARLY] Disable mitigations for Spectre Variant 1 (bounds check bypass). With this option data leaks are possible in the system. - nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for - the Spectre variant 2 (indirect branch prediction) - vulnerability. System may allow data leaks with this - option. + nospectre_v2 [X86,PPC_E500,ARM64,EARLY] Disable all mitigations + for the Spectre variant 2 (indirect branch + prediction) vulnerability. System may allow data + leaks with this option. - no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV] Disable + no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable paravirtualized steal time accounting. steal time is computed, but won't influence scheduler behaviour @@ -4008,7 +4017,7 @@ broken timer IRQ sources. no_uaccess_flush - [PPC] Don't flush the L1-D cache after accessing user data. + [PPC,EARLY] Don't flush the L1-D cache after accessing user data. novmcoredd [KNL,KDUMP] Disable device dump. Device dump allows drivers to @@ -4022,15 +4031,15 @@ is set. no-vmw-sched-clock - [X86,PV_OPS] Disable paravirtualized VMware scheduler - clock and use the default one. + [X86,PV_OPS,EARLY] Disable paravirtualized VMware + scheduler clock and use the default one. nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). - nowb [ARM] + nowb [ARM,EARLY] - nox2apic [X86-64,APIC] Do not enable x2APIC mode. + nox2apic [X86-64,APIC,EARLY] Do not enable x2APIC mode. NOTE: this parameter will be ignored on systems with the LEGACY_XAPIC_DISABLED bit set in the @@ -4068,7 +4077,7 @@ purges which is reported from either PAL_VM_SUMMARY or SAL PALO. - nr_cpus= [SMP] Maximum number of processors that an SMP kernel + nr_cpus= [SMP,EARLY] Maximum number of processors that an SMP kernel could support. nr_cpus=n : n >= 1 limits the kernel to support 'n' processors. It could be larger than the number of already plugged CPU during bootup, later in @@ -4079,8 +4088,9 @@ nr_uarts= [SERIAL] maximum number of UARTs to be registered. - numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86] Disable NUMA, Only - set up a single NUMA node spanning all memory. + numa=off [KNL, ARM64, PPC, RISCV, SPARC, X86, EARLY] + Disable NUMA, Only set up a single NUMA node + spanning all memory. numa_balancing= [KNL,ARM64,PPC,RISCV,S390,X86] Enable or disable automatic NUMA balancing. @@ -4091,7 +4101,7 @@ This can be set from sysctl after boot. See Documentation/admin-guide/sysctl/vm.rst for details. - ohci1394_dma=early [HW] enable debugging via the ohci1394 driver. + ohci1394_dma=early [HW,EARLY] enable debugging via the ohci1394 driver. See Documentation/core-api/debugging-via-ohci1394.rst for more info. @@ -4117,7 +4127,8 @@ Once locked, the boundary cannot be changed. 1 indicates lock status, 0 indicates unlock status. - oops=panic Always panic on oopses. Default is to just kill the + oops=panic [KNL,EARLY] + Always panic on oopses. 
Default is to just kill the process, but there is a small probability of deadlocking the machine. This will also cause panics on machine check exceptions. @@ -4133,13 +4144,13 @@ can be read from sysfs at: /sys/module/page_alloc/parameters/shuffle. - page_owner= [KNL] Boot-time page_owner enabling option. + page_owner= [KNL,EARLY] Boot-time page_owner enabling option. Storage of the information about who allocated each page is disabled in default. With this switch, we can turn it on. on: enable the feature - page_poison= [KNL] Boot-time parameter changing the state of + page_poison= [KNL,EARLY] Boot-time parameter changing the state of poisoning on the buddy allocator, available with CONFIG_PAGE_POISONING=y. off: turn off poisoning (default) @@ -4157,7 +4168,8 @@ timeout < 0: reboot immediately Format: - panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + panic_on_taint= [KNL,EARLY] + Bitmask for conditionally calling panic() in add_taint() Format: [,nousertaint] Hexadecimal bitmask representing the set of TAINT flags that will cause the kernel to panic when add_taint() is @@ -4313,7 +4325,7 @@ pcbit= [HW,ISDN] - pci=option[,option...] [PCI] various PCI subsystem options. + pci=option[,option...] [PCI,EARLY] various PCI subsystem options. Some options herein operate on a specific device or a set of devices (). These are @@ -4582,7 +4594,8 @@ Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c - percpu_alloc= Select which percpu first chunk allocator to use. + percpu_alloc= [MM,EARLY] + Select which percpu first chunk allocator to use. Currently supported values are "embed" and "page". Archs may support subset or none of the selections. See comments in mm/percpu.c for details on each @@ -4651,12 +4664,12 @@ execution priority. ppc_strict_facility_enable - [PPC] This option catches any kernel floating point, + [PPC,ENABLE] This option catches any kernel floating point, Altivec, VSX and SPE outside of regions specifically allowed (eg kernel_enable_fpu()/kernel_disable_fpu()). There is some performance impact when enabling this. - ppc_tm= [PPC] + ppc_tm= [PPC,EARLY] Format: {"off"} Disable Hardware Transactional Memory @@ -4766,7 +4779,7 @@ [KNL] Number of legacy pty's. Overwrites compiled-in default number. - quiet [KNL] Disable most log messages + quiet [KNL,EARLY] Disable most log messages r128= [HW,DRM] @@ -4783,17 +4796,17 @@ ramdisk_start= [RAM] RAM disk image start address random.trust_cpu=off - [KNL] Disable trusting the use of the CPU's + [KNL,EARLY] Disable trusting the use of the CPU's random number generator (if available) to initialize the kernel's RNG. random.trust_bootloader=off - [KNL] Disable trusting the use of the a seed + [KNL,EARLY] Disable trusting the use of the a seed passed by the bootloader (if available) to initialize the kernel's RNG. randomize_kstack_offset= - [KNL] Enable or disable kernel stack offset + [KNL,EARLY] Enable or disable kernel stack offset randomization, which provides roughly 5 bits of entropy, frustrating memory corruption attacks that depend on stack address determinism or @@ -5484,7 +5497,7 @@ Run specified binary instead of /init from the ramdisk, used for early userspace startup. See initrd. - rdrand= [X86] + rdrand= [X86,EARLY] force - Override the decision by the kernel to hide the advertisement of RDRAND support (this affects certain AMD processors because of buggy BIOS @@ -5580,7 +5593,7 @@ them. If is less than 0x10000, the region is assumed to be I/O ports; otherwise it is memory. 
- reservetop= [X86-32] + reservetop= [X86-32,EARLY] Format: nn[KMG] Reserves a hole at the top of the kernel virtual address space. @@ -5665,7 +5678,7 @@ [KNL] Disable ring 3 MONITOR/MWAIT feature on supported CPUs. - riscv_isa_fallback [RISCV] + riscv_isa_fallback [RISCV,EARLY] When CONFIG_RISCV_ISA_FALLBACK is not enabled, permit falling back to detecting extension support by parsing "riscv,isa" property on devicetree systems when the @@ -5674,13 +5687,14 @@ ro [KNL] Mount root device read-only on boot - rodata= [KNL] + rodata= [KNL,EARLY] on Mark read-only kernel memory as read-only (default). off Leave read-only kernel memory writable for debugging. full Mark read-only kernel memory and aliases as read-only [arm64] rockchip.usb_uart + [EARLY] Enable the uart passthrough on the designated usb port on Rockchip SoCs. When active, the signals of the debug-uart get routed to the D+ and D- pins of the usb @@ -5741,7 +5755,7 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. - sched_verbose [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature @@ -5856,7 +5870,7 @@ non-zero "wait" parameter. See weight_single and weight_many. - skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate + skew_tick= [KNL,EARLY] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. Format: { "0" | "1" } @@ -5987,10 +6001,10 @@ 1: Fast pin select (default) 2: ATC IRMode - smt= [KNL,MIPS,S390] Set the maximum number of threads (logical - CPUs) to use per physical CPU on systems capable of - symmetric multithreading (SMT). Will be capped to the - actual hardware limit. + smt= [KNL,MIPS,S390,EARLY] Set the maximum number of threads + (logical CPUs) to use per physical CPU on systems + capable of symmetric multithreading (SMT). Will + be capped to the actual hardware limit. Format: Default: -1 (no limit) @@ -6012,7 +6026,7 @@ sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst - spectre_v2= [X86] Control mitigation of Spectre variant 2 + spectre_v2= [X86,EARLY] Control mitigation of Spectre variant 2 (indirect branch speculation) vulnerability. The default operation protects the kernel from user space attacks. @@ -6092,7 +6106,7 @@ spectre_v2_user=auto. spec_rstack_overflow= - [X86] Control RAS overflow mitigation on AMD Zen CPUs + [X86,EARLY] Control RAS overflow mitigation on AMD Zen CPUs off - Disable mitigation microcode - Enable microcode mitigation only @@ -6103,7 +6117,7 @@ (cloud-specific mitigation) spec_store_bypass_disable= - [HW] Control Speculative Store Bypass (SSB) Disable mitigation + [HW,EARLY] Control Speculative Store Bypass (SSB) Disable mitigation (Speculative Store Bypass vulnerability) Certain CPUs are vulnerable to an exploit against a @@ -6199,7 +6213,7 @@ #DB exception for bus lock is triggered only when CPL > 0. - srbds= [X86,INTEL] + srbds= [X86,INTEL,EARLY] Control the Special Register Buffer Data Sampling (SRBDS) mitigation. @@ -6286,7 +6300,7 @@ srcutree.convert_to_big must have the 0x10 bit set for contention-based conversions to occur. 
- ssbd= [ARM64,HW] + ssbd= [ARM64,HW,EARLY] Speculative Store Bypass Disable control On CPUs that are vulnerable to the Speculative @@ -6310,7 +6324,7 @@ growing up) the main stack are reserved for no other mapping. Default value is 256 pages. - stack_depot_disable= [KNL] + stack_depot_disable= [KNL,EARLY] Setting this to true through kernel command line will disable the stack depot thereby saving the static memory consumed by the stack hash table. By default this is set @@ -6349,12 +6363,12 @@ be used to filter out binaries which have not yet been made aware of AT_MINSIGSTKSZ. - stress_hpt [PPC] + stress_hpt [PPC,EARLY] Limits the number of kernel HPT entries in the hash page table to increase the rate of hash page table faults on kernel addresses. - stress_slb [PPC] + stress_slb [PPC,EARLY] Limits the number of kernel SLB entries, and flushes them frequently to increase the rate of SLB faults on kernel addresses. @@ -6414,7 +6428,7 @@ This parameter controls use of the Protected Execution Facility on pSeries. - swiotlb= [ARM,IA-64,PPC,MIPS,X86] + swiotlb= [ARM,IA-64,PPC,MIPS,X86,EARLY] Format: { [,] | force | noforce } -- Number of I/O TLB slabs -- Second integer after comma. Number of swiotlb @@ -6424,7 +6438,7 @@ wouldn't be automatically used by the kernel noforce -- Never use bounce buffers (for debugging) - switches= [HW,M68k] + switches= [HW,M68k,EARLY] sysctl.*= [KNL] Set a sysctl parameter, right before loading the init @@ -6483,11 +6497,11 @@ : poll all this frequency 0: no polling (default) - threadirqs [KNL] + threadirqs [KNL,EARLY] Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. - topology= [S390] + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu topology information if the hardware supports this. @@ -6728,7 +6742,7 @@ can be overridden by a later tsc=nowatchdog. A console message will flag any such suppression or overriding. - tsc_early_khz= [X86] Skip early TSC calibration and use the given + tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given value instead. Useful when the early TSC frequency discovery procedure is not reliable, such as on overclocked systems with CPUID.16h support and partial CPUID.15h support. @@ -6763,7 +6777,7 @@ See Documentation/admin-guide/hw-vuln/tsx_async_abort.rst for more details. - tsx_async_abort= [X86,INTEL] Control mitigation for the TSX Async + tsx_async_abort= [X86,INTEL,EARLY] Control mitigation for the TSX Async Abort (TAA) vulnerability. Similar to Micro-architectural Data Sampling (MDS) @@ -6829,7 +6843,7 @@ unknown_nmi_panic [X86] Cause panic on unknown NMI. - unwind_debug [X86-64] + unwind_debug [X86-64,EARLY] Enable unwinder debug output. This can be useful for debugging certain unwinder error conditions, including corrupt stacks and @@ -7019,7 +7033,7 @@ Example: user_debug=31 userpte= - [X86] Flags controlling user PTE allocations. + [X86,EARLY] Flags controlling user PTE allocations. nohigh = do not allocate PTE pages in HIGHMEM regardless of setting @@ -7048,7 +7062,7 @@ vector= [IA-64,SMP] vector=percpu: enable percpu vector domain - video= [FB] Frame buffer configuration + video= [FB,EARLY] Frame buffer configuration See Documentation/fb/modedb.rst. video.brightness_switch_enabled= [ACPI] @@ -7096,13 +7110,13 @@ P Enable page structure init time poisoning - Disable all of the above options - vmalloc=nn[KMG] [KNL,BOOT] Forces the vmalloc area to have an exact - size of . 
This can be used to increase the - minimum size (128MB on x86). It can also be used to - decrease the size and leave more room for directly - mapped kernel RAM. + vmalloc=nn[KMG] [KNL,BOOT,EARLY] Forces the vmalloc area to have an + exact size of . This can be used to increase + the minimum size (128MB on x86). It can also be + used to decrease the size and leave more room + for directly mapped kernel RAM. - vmcp_cma=nn[MG] [KNL,S390] + vmcp_cma=nn[MG] [KNL,S390,EARLY] Sets the memory size reserved for contiguous memory allocations for the vmcp device driver. @@ -7115,7 +7129,7 @@ vmpoff= [KNL,S390] Perform z/VM CP command after power off. Format: - vsyscall= [X86-64] + vsyscall= [X86-64,EARLY] Controls the behavior of vsyscalls (i.e. calls to fixed addresses of 0xffffffffff600x00 from legacy code). Most statically-linked binaries and older @@ -7263,13 +7277,13 @@ When enabled, memory and cache locality will be impacted. - writecombine= [LOONGARCH] Control the MAT (Memory Access Type) of - ioremap_wc(). + writecombine= [LOONGARCH,EARLY] Control the MAT (Memory Access + Type) of ioremap_wc(). on - Enable writecombine, use WUC for ioremap_wc() off - Disable writecombine, use SUC for ioremap_wc() - x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of + x2apic_phys [X86-64,APIC,EARLY] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. @@ -7280,7 +7294,7 @@ save/restore/migration must be enabled to handle larger domains. - xen_emul_unplug= [HW,X86,XEN] + xen_emul_unplug= [HW,X86,XEN,EARLY] Unplug Xen emulated devices Format: [unplug0,][unplug1] ide-disks -- unplug primary master IDE devices @@ -7292,17 +7306,17 @@ the unplug protocol never -- do not unplug even if version check succeeds - xen_legacy_crash [X86,XEN] + xen_legacy_crash [X86,XEN,EARLY] Crash from Xen panic notifier, without executing late panic() code such as dumping handler. - xen_msr_safe= [X86,XEN] + xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR access functions when running as Xen PV guest. The default value is controlled by CONFIG_XEN_PV_MSR_SAFE. - xen_nopvspin [X86,XEN] + xen_nopvspin [X86,XEN,EARLY] Disables the qspinlock slowpath using Xen PV optimizations. This parameter is obsoleted by "nopvspin" parameter, which has equivalent effect for XEN platform. @@ -7314,7 +7328,7 @@ has equivalent effect for XEN platform. xen_no_vector_callback - [KNL,X86,XEN] Disable the vector callback for Xen + [KNL,X86,XEN,EARLY] Disable the vector callback for Xen event channel interrupts. xen_scrub_pages= [XEN] @@ -7323,7 +7337,7 @@ with /sys/devices/system/xen_memory/xen_memory0/scrub_pages. Default value controlled with CONFIG_XEN_SCRUB_PAGES_DEFAULT. - xen_timer_slop= [X86-64,XEN] + xen_timer_slop= [X86-64,XEN,EARLY] Set the timer slop (in nanoseconds) for the virtual Xen timers (default is 100000). This adjusts the minimum delta of virtualized Xen timers, where lower values @@ -7376,7 +7390,7 @@ host controller quirks. Meaning of each bit can be consulted in header drivers/usb/host/xhci.h. - xmon [PPC] + xmon [PPC,EARLY] Format: { early | on | rw | ro | off } Controls if xmon debugger is enabled. Default is off. Passing only "xmon" is equivalent to "xmon=early". From 3b239b308e94ce6c65f6646d251edb737b82e716 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 4 Dec 2023 20:34:58 -0800 Subject: [PATCH 20/34] context_tracking: Fix kerneldoc headers for __ct_user_{enter,exit}() Document the "state" parameter of both of these functions. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312041922.YZCcEPYD-lkp@intel.com/ Signed-off-by: Paul E. McKenney Tested-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/context_tracking.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6ef0b35fc28c..70ae70d03823 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -458,6 +458,8 @@ static __always_inline void context_tracking_recursion_exit(void) * __ct_user_enter - Inform the context tracking that the CPU is going * to enter user or guest space mode. * + * @state: userspace context-tracking state to enter. + * * This function must be called right before we switch from the kernel * to user or guest space, when it's guaranteed the remaining kernel * instructions to execute won't use any RCU read side critical section @@ -595,6 +597,8 @@ NOKPROBE_SYMBOL(user_enter_callable); * __ct_user_exit - Inform the context tracking that the CPU is * exiting user or guest mode and entering the kernel. * + * @state: userspace context-tracking state being exited from. + * * This function must be called after we entered the kernel from user or * guest space before any use of RCU read side critical section. This * potentially include any high level kernel code like syscalls, exceptions, From 56823e9f60f0eedb9981f28b664232a9cace1015 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 11 Dec 2023 11:55:17 -0800 Subject: [PATCH 21/34] doc: Clarify use of slab constructors and SLAB_TYPESAFE_BY_RCU This commit explicitly states that you should initialize any locks to be used by readers in your SLAB_TYPESAFE_BY_RCU constructor. Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/whatisRCU.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 246ce0d0b4d1..872ac665223f 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -963,8 +963,8 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be initialized after each and every call to kmem_cache_alloc(), which renders reference-free spinlock acquisition completely unsafe. Therefore, when using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. -(Those willing to use a kmem_cache constructor may also use locking, -including cache-friendly sequence locking.) +(Those willing to initialize their locks in a kmem_cache constructor +may also use locking, including cache-friendly sequence locking.) With traditional reference counting -- such as that implemented by the kref library in Linux -- there is typically code that runs when the last From e15aed426a1bf5ba98e5a3989a7d41f2b2ee96d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 13 Dec 2023 09:49:20 -0800 Subject: [PATCH 22/34] doc: Update checklist.rst discussion of callback execution This commit completes the list of call_rcu*() functions that are not guaranteed to have their callbacks executing on the same CPU. While in the area, fix an unrelated typo. Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- Documentation/RCU/checklist.rst | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index addd5c1547a4..3e6407de231c 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -383,16 +383,17 @@ over a rather long period of time, but improvements are always welcome! must use whatever locking or other synchronization is required to safely access and/or modify that data structure. - Do not assume that RCU callbacks will be executed on the same - CPU that executed the corresponding call_rcu() or call_srcu(). - For example, if a given CPU goes offline while having an RCU - callback pending, then that RCU callback will execute on some - surviving CPU. (If this was not the case, a self-spawning RCU - callback would prevent the victim CPU from ever going offline.) - Furthermore, CPUs designated by rcu_nocbs= might well *always* - have their RCU callbacks executed on some other CPUs, in fact, - for some real-time workloads, this is the whole point of using - the rcu_nocbs= kernel boot parameter. + Do not assume that RCU callbacks will be executed on + the same CPU that executed the corresponding call_rcu(), + call_srcu(), call_rcu_tasks(), call_rcu_tasks_rude(), or + call_rcu_tasks_trace(). For example, if a given CPU goes offline + while having an RCU callback pending, then that RCU callback + will execute on some surviving CPU. (If this was not the case, + a self-spawning RCU callback would prevent the victim CPU from + ever going offline.) Furthermore, CPUs designated by rcu_nocbs= + might well *always* have their RCU callbacks executed on some + other CPUs, in fact, for some real-time workloads, this is the + whole point of using the rcu_nocbs= kernel boot parameter. In addition, do not assume that callbacks queued in a given order will be invoked in that order, even if they all are queued on the From 499d7e7e83d25fcf0fa1a8c0be6857a84cbf6a4a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Nov 2023 14:11:26 -0500 Subject: [PATCH 23/34] rcu: Rename jiffies_till_flush to jiffies_lazy_flush The variable name jiffies_till_flush is too generic and therefore: * It may shadow a global variable * It doesn't tell on what it operates Make the name more precise, along with the related APIs. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 8 ++++---- kernel/rcu/rcuscale.c | 6 +++--- kernel/rcu/tree_nocb.h | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f94f65877f2b..dcfb666f2499 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -543,11 +543,11 @@ enum rcutorture_type { }; #if defined(CONFIG_RCU_LAZY) -unsigned long rcu_lazy_get_jiffies_till_flush(void); -void rcu_lazy_set_jiffies_till_flush(unsigned long j); +unsigned long rcu_get_jiffies_lazy_flush(void); +void rcu_set_jiffies_lazy_flush(unsigned long j); #else -static inline unsigned long rcu_lazy_get_jiffies_till_flush(void) { return 0; } -static inline void rcu_lazy_set_jiffies_till_flush(unsigned long j) { } +static inline unsigned long rcu_get_jiffies_lazy_flush(void) { return 0; } +static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index ffdb30495e3c..8db4fedaaa1e 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -764,9 +764,9 @@ kfree_scale_init(void) if (kfree_by_call_rcu) { /* do a test to check the timeout. */ - orig_jif = rcu_lazy_get_jiffies_till_flush(); + orig_jif = rcu_get_jiffies_lazy_flush(); - rcu_lazy_set_jiffies_till_flush(2 * HZ); + rcu_set_jiffies_lazy_flush(2 * HZ); rcu_barrier(); jif_start = jiffies; @@ -775,7 +775,7 @@ kfree_scale_init(void) smp_cond_load_relaxed(&rcu_lazy_test1_cb_called, VAL == 1); - rcu_lazy_set_jiffies_till_flush(orig_jif); + rcu_set_jiffies_lazy_flush(orig_jif); if (WARN_ON_ONCE(jiffies_at_lazy_cb - jif_start < 2 * HZ)) { pr_alert("ERROR: call_rcu() CBs are not being lazy as expected!\n"); diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 4efbf7333d4e..aecef51166c7 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -256,6 +256,7 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) return __wake_nocb_gp(rdp_gp, rdp, force, flags); } +#ifdef CONFIG_RCU_LAZY /* * LAZY_FLUSH_JIFFIES decides the maximum amount of time that * can elapse before lazy callbacks are flushed. Lazy callbacks @@ -264,21 +265,20 @@ static bool wake_nocb_gp(struct rcu_data *rdp, bool force) * left unsubmitted to RCU after those many jiffies. */ #define LAZY_FLUSH_JIFFIES (10 * HZ) -static unsigned long jiffies_till_flush = LAZY_FLUSH_JIFFIES; +static unsigned long jiffies_lazy_flush = LAZY_FLUSH_JIFFIES; -#ifdef CONFIG_RCU_LAZY // To be called only from test code. 
-void rcu_lazy_set_jiffies_till_flush(unsigned long jif) +void rcu_set_jiffies_lazy_flush(unsigned long jif) { - jiffies_till_flush = jif; + jiffies_lazy_flush = jif; } -EXPORT_SYMBOL(rcu_lazy_set_jiffies_till_flush); +EXPORT_SYMBOL(rcu_set_jiffies_lazy_flush); -unsigned long rcu_lazy_get_jiffies_till_flush(void) +unsigned long rcu_get_jiffies_lazy_flush(void) { - return jiffies_till_flush; + return jiffies_lazy_flush; } -EXPORT_SYMBOL(rcu_lazy_get_jiffies_till_flush); +EXPORT_SYMBOL(rcu_get_jiffies_lazy_flush); #endif /* @@ -299,7 +299,7 @@ static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, */ if (waketype == RCU_NOCB_WAKE_LAZY && rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) { - mod_timer(&rdp_gp->nocb_timer, jiffies + jiffies_till_flush); + mod_timer(&rdp_gp->nocb_timer, jiffies + rcu_get_jiffies_lazy_flush()); WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype); } else if (waketype == RCU_NOCB_WAKE_BYPASS) { mod_timer(&rdp_gp->nocb_timer, jiffies + 2); @@ -482,7 +482,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, // flush ->nocb_bypass to ->cblist. if ((ncbs && !bypass_is_lazy && j != READ_ONCE(rdp->nocb_bypass_first)) || (ncbs && bypass_is_lazy && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush))) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()))) || ncbs >= qhimark) { rcu_nocb_lock(rdp); *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist); @@ -723,7 +723,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) lazy_ncbs = READ_ONCE(rdp->lazy_len); if (bypass_ncbs && (lazy_ncbs == bypass_ncbs) && - (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + jiffies_till_flush) || + (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + rcu_get_jiffies_lazy_flush()) || bypass_ncbs > 2 * qhimark)) { flush_bypass = true; } else if (bypass_ncbs && (lazy_ncbs != bypass_ncbs) && From 7f66f099de4dc4b1a66a3f94e6db16409924a6f8 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Sun, 3 Dec 2023 01:12:52 +0000 Subject: [PATCH 24/34] rcu: Provide a boot time parameter to control lazy RCU To allow more flexible arrangements while still provide a single kernel for distros, provide a boot time parameter to enable/disable lazy RCU. Specify: rcutree.enable_rcu_lazy=[y|1|n|0] Which also requires rcu_nocbs=all at boot time to enable/disable lazy RCU. To disable it by default at build time when CONFIG_RCU_LAZY=y, the new CONFIG_RCU_LAZY_DEFAULT_OFF can be used. Signed-off-by: Qais Yousef (Google) Tested-by: Andrea Righi Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- Documentation/admin-guide/kernel-parameters.txt | 5 +++++ kernel/rcu/Kconfig | 13 +++++++++++++ kernel/rcu/tree.c | 7 ++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 31b3a25680d0..b6c848c29a53 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5034,6 +5034,11 @@ this kernel boot parameter, forcibly setting it to zero. + rcutree.enable_rcu_lazy= [KNL] + To save power, batch RCU callbacks and flush after + delay, memory pressure or callback list growing too + big. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). 
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index bdd7eadb33d8..e7d2dd267593 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -314,6 +314,19 @@ config RCU_LAZY To save power, batch RCU callbacks and flush after delay, memory pressure, or callback list growing too big. + Requires rcu_nocbs=all to be set. + + Use rcutree.enable_rcu_lazy=0 to turn it off at boot time. + +config RCU_LAZY_DEFAULT_OFF + bool "Turn RCU lazy invocation off by default" + depends on RCU_LAZY + default n + help + Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default + off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch + it back on. + config RCU_DOUBLE_CHECK_CB_TIME bool "RCU callback-batch backup time check" depends on RCU_EXPERT diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2bccfd37c38..41c50a6c607e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2753,6 +2753,9 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) } #ifdef CONFIG_RCU_LAZY +static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(CONFIG_RCU_LAZY_DEFAULT_OFF); +module_param(enable_rcu_lazy, bool, 0444); + /** * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and * flush all lazy callbacks (including the new one) to the main ->cblist while @@ -2778,6 +2781,8 @@ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) __call_rcu_common(head, func, false); } EXPORT_SYMBOL_GPL(call_rcu_hurry); +#else +#define enable_rcu_lazy false #endif /** @@ -2826,7 +2831,7 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu_common(head, func, IS_ENABLED(CONFIG_RCU_LAZY)); + __call_rcu_common(head, func, enable_rcu_lazy); } EXPORT_SYMBOL_GPL(call_rcu); From 67050837ec14fc20a26b237ce965c50c85a318b7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 27 Dec 2023 12:47:38 -0500 Subject: [PATCH 25/34] srcu: Improve comments about acceleration leak The comments added in commit 1ef990c4b36b ("srcu: No need to advance/accelerate if no callback enqueued") are a bit confusing. The comments are describing a scenario for code that was moved and is no longer the way it was (snapshot after advancing). Improve the code comments to reflect this and also document why acceleration can never fail. Cc: Frederic Weisbecker Cc: Neeraj Upadhyay Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- kernel/rcu/srcutree.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0351a4e83529..e4d673fc30f4 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1234,11 +1234,20 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); /* - * The snapshot for acceleration must be taken _before_ the read of the - * current gp sequence used for advancing, otherwise advancing may fail - * and acceleration may then fail too. + * It's crucial to capture the snapshot 's' for acceleration before + * reading the current gp_seq that is used for advancing. This is + * essential because if the acceleration snapshot is taken after a + * failed advancement attempt, there's a risk that a grace period may + * conclude and a new one may start in the interim. 
If the snapshot is + * captured after this sequence of events, the acceleration snapshot 's' + * could be excessively advanced, leading to acceleration failure. + * In such a scenario, an 'acceleration leak' can occur, where new + * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment. + * Also note that encountering advancing failures is a normal + * occurrence when the grace period for RCU_WAIT_TAIL is in progress. * - * This could happen if: + * To see this, consider the following events which occur if + * rcu_seq_snap() were to be called after advance: * * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). @@ -1264,6 +1273,13 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, if (rhp) { rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq)); + /* + * Acceleration can never fail because the base current gp_seq + * used for acceleration is <= the value of gp_seq used for + * advancing. This means that RCU_NEXT_TAIL segment will + * always be able to be emptied by the acceleration into the + * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments. + */ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s)); } if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { From fd2a749d3f4f7ff0129af1a2c2685faca407ea54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Jan 2024 10:59:25 -0800 Subject: [PATCH 26/34] rcutorture: Suppress rtort_pipe_count warnings until after stalls Currently, if rcu_torture_writer() sees fewer than ten grace periods having elapsed during a call to stutter_wait() that actually waited, the rtort_pipe_count warning is emitted. This has worked well for a long time. Except that the rcutorture TREE07 scenario now does a short-term 14-second RCU CPU stall, which can most definitely case false-positive rtort_pipe_count warnings. This commit therefore changes rcu_torture_writer() to compute the full expected holdoff and stall duration, and to refuse to report any rtort_pipe_count warnings until after all stalls have completed. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/rcu/rcutorture.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7567ca8e743c..45d6b4c3d199 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1368,9 +1368,13 @@ rcu_torture_writer(void *arg) struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + unsigned long stallsdone = jiffies; bool stutter_waited; unsigned long ulo[NUM_ACTIVE_RCU_POLL_OLDSTATE]; + // If a new stall test is added, this must be adjusted. 
+ if (stall_cpu_holdoff + stall_gp_kthread + stall_cpu) + stallsdone += (stall_cpu_holdoff + stall_gp_kthread + stall_cpu + 60) * HZ; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); if (!can_expedite) pr_alert("%s" TORTURE_FLAG @@ -1576,11 +1580,11 @@ rcu_torture_writer(void *arg) !atomic_read(&rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - boot_ended) + boot_ended && + time_after(jiffies, stallsdone)) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && - rcu_access_pointer(rcu_torture_current) != - &rcu_tortures[i]) { + rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); show_rcu_gp_kthreads(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); @@ -2441,7 +2445,8 @@ static struct notifier_block rcu_torture_stall_block = { /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. + * induces a CPU stall for the time specified by stall_cpu. If a new + * stall test is added, stallsdone in rcu_torture_writer() must be adjusted. */ static int rcu_torture_stall(void *args) { From c90e3ecc91584558d24c82940a3651fdfc174be0 Mon Sep 17 00:00:00 2001 From: Onkarnath Date: Thu, 11 Jan 2024 14:57:22 +0530 Subject: [PATCH 27/34] rcu/sync: remove un-used rcu_sync_enter_start function With commit '6a010a49b63a ("cgroup: Make !percpu threadgroup_rwsem operations optional")' usage of rcu_sync_enter_start is removed. So this function can also be removed. In the words of Oleg Nesterov: __rcu_sync_enter(wait => false) is a better alternative if someone needs rcu_sync_enter_start() again. Link: https://lore.kernel.org/all/20220725121208.GB28662@redhat.com/ Signed-off-by: Onkarnath Signed-off-by: Maninder Singh Acked-by: Oleg Nesterov Acked-by: Tejun Heo Reviewed-by: Paul E. McKenney Signed-off-by: Boqun Feng --- include/linux/rcu_sync.h | 1 - kernel/rcu/sync.c | 16 ---------------- 2 files changed, 17 deletions(-) diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h index 0027d4c8087c..3860dbb9107a 100644 --- a/include/linux/rcu_sync.h +++ b/include/linux/rcu_sync.h @@ -37,7 +37,6 @@ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) } extern void rcu_sync_init(struct rcu_sync *); -extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index e550f97779b8..86df878a2fee 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c @@ -24,22 +24,6 @@ void rcu_sync_init(struct rcu_sync *rsp) init_waitqueue_head(&rsp->gp_wait); } -/** - * rcu_sync_enter_start - Force readers onto slow path for multiple updates - * @rsp: Pointer to rcu_sync structure to use for synchronization - * - * Must be called after rcu_sync_init() and before first use. - * - * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}() - * pairs turn into NO-OPs. - */ -void rcu_sync_enter_start(struct rcu_sync *rsp) -{ - rsp->gp_count++; - rsp->gp_state = GP_PASSED; -} - - static void rcu_sync_func(struct rcu_head *rhp); static void rcu_sync_call(struct rcu_sync *rsp) From 2eb52fa8900e642b3b5054c4bf9776089d2a935f Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 4 Dec 2023 09:33:29 -0800 Subject: [PATCH 28/34] rcu-tasks: Repair RCU Tasks Trace quiescence check The context-switch-time check for RCU Tasks Trace quiescence expects current->trc_reader_special.b.need_qs to be zero, and if so, updates it to TRC_NEED_QS_CHECKED. This is backwards, because if this value is zero, there is no RCU Tasks Trace grace period in flight, an thus no need for a quiescent state. Instead, when a grace period starts, this field is set to TRC_NEED_QS. This commit therefore changes the check from zero to TRC_NEED_QS. Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Tested-by: Steven Rostedt (Google) Signed-off-by: Boqun Feng --- include/linux/rcupdate.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0746b1b0b663..16f519914415 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,9 +184,9 @@ void rcu_tasks_trace_qs_blkd(struct task_struct *t); do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ - if (likely(!READ_ONCE((t)->trc_reader_special.b.need_qs)) && \ + if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ - rcu_trc_cmpxchg_need_qs((t), 0, TRC_NEED_QS_CHECKED); \ + rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ From bfe93930ea1ea3c6c115a7d44af6e4fea609067e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Feb 2024 13:08:22 -0800 Subject: [PATCH 29/34] rcu-tasks: Add data to eliminate RCU-tasks/do_exit() deadlocks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore adds the data structures that will be needed to rely on these quiescent states and to eliminate these deadlocks. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. 
McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Reviewed-by: Frederic Weisbecker Signed-off-by: Boqun Feng --- include/linux/sched.h | 2 ++ kernel/rcu/tasks.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..5eeebed2dd9b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -858,6 +858,8 @@ struct task_struct { u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; + int rcu_tasks_exit_cpu; + struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 732ad5b39946..b7d5f2757053 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -32,6 +32,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @rtp_irq_work: IRQ work queue for deferred wakeups. * @barrier_q_head: RCU callback for barrier operation. * @rtp_blkd_tasks: List of tasks blocked as readers. + * @rtp_exit_list: List of tasks in the latter portion of do_exit(). * @cpu: CPU number corresponding to this entry. * @rtpp: Pointer to the rcu_tasks structure. */ @@ -46,6 +47,7 @@ struct rcu_tasks_percpu { struct irq_work rtp_irq_work; struct rcu_head barrier_q_head; struct list_head rtp_blkd_tasks; + struct list_head rtp_exit_list; int cpu; struct rcu_tasks *rtpp; }; From 30ef09635b9ed3ebca4f677495332a2e444a5cda Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 12:29:54 -0800 Subject: [PATCH 30/34] rcu-tasks: Initialize callback lists at rcu_init() time In order for RCU Tasks to reliably maintain per-CPU lists of exiting tasks, those lists must be initialized before it is possible for tasks to exit, especially given that the boot CPU is not necessarily CPU 0 (an example being, powerpc kexec() kernels). And at the time that rcu_init_tasks_generic() is called, a task could potentially exit, unconventional though that sort of thing might be. This commit therefore moves the calls to cblist_init_generic() from functions called from rcu_init_tasks_generic() to a new function named tasks_cblist_init_generic() that is invoked from rcu_init(). This constituted a bug in a commit that never went to mainline, so there is no need for any backporting to -stable. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney Signed-off-by: Boqun Feng --- kernel/rcu/rcu.h | 6 ++++++ kernel/rcu/tasks.h | 24 ++++++++++++++++++------ kernel/rcu/tiny.c | 1 + kernel/rcu/tree.c | 2 ++ 4 files changed, 27 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index f94f65877f2b..ef63ea59c8b6 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -528,6 +528,12 @@ struct task_struct *get_rcu_tasks_gp_kthread(void); struct task_struct *get_rcu_tasks_rude_gp_kthread(void); #endif // # ifdef CONFIG_TASKS_RUDE_RCU +#ifdef CONFIG_TASKS_RCU_GENERIC +void tasks_cblist_init_generic(void); +#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +static inline void tasks_cblist_init_generic(void) { } +#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ + #define RCU_SCHEDULER_INACTIVE 0 #define RCU_SCHEDULER_INIT 1 #define RCU_SCHEDULER_RUNNING 2 diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index b7d5f2757053..6961a1b5b783 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -242,7 +242,6 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) static void cblist_init_generic(struct rcu_tasks *rtp) { int cpu; - unsigned long flags; int lim; int shift; @@ -268,10 +267,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) WARN_ON_ONCE(!rtpcp); if (cpu) raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); - local_irq_save(flags); // serialize initialization if (rcu_segcblist_empty(&rtpcp->cblist)) rcu_segcblist_init(&rtpcp->cblist); - local_irq_restore(flags); INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); rtpcp->cpu = cpu; rtpcp->rtpp = rtp; @@ -1120,7 +1117,6 @@ module_param(rcu_tasks_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_kthread(void) { - cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; if (rcu_tasks_lazy_ms >= 0) @@ -1284,7 +1280,6 @@ module_param(rcu_tasks_rude_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_rude_kthread(void) { - cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; if (rcu_tasks_rude_lazy_ms >= 0) rcu_tasks_rude.lazy_jiffies = msecs_to_jiffies(rcu_tasks_rude_lazy_ms); @@ -1916,7 +1911,6 @@ module_param(rcu_tasks_trace_lazy_ms, int, 0444); static int __init rcu_spawn_tasks_trace_kthread(void) { - cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; @@ -2088,6 +2082,24 @@ late_initcall(rcu_tasks_verify_schedule_work); static void rcu_tasks_initiate_self_tests(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ +void __init tasks_cblist_init_generic(void) +{ + lockdep_assert_irqs_disabled(); + WARN_ON(num_online_cpus() > 1); + +#ifdef CONFIG_TASKS_RCU + cblist_init_generic(&rcu_tasks); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + cblist_init_generic(&rcu_tasks_rude); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + cblist_init_generic(&rcu_tasks_trace); +#endif +} + void __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index fec804b79080..705c0d16850a 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -261,4 +261,5 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); rcu_early_boot_tests(); + tasks_cblist_init_generic(); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2bccfd37c38..ba9137f39d14 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -5165,6 +5165,8 @@ void __init rcu_init(void) (void)start_poll_synchronize_rcu_expedited(); 
rcu_test_sync_prims(); + + tasks_cblist_init_generic(); } #include "tree_stall.h" From 46faf9d8e1d52e4a91c382c6c72da6bd8e68297b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 5 Feb 2024 13:10:19 -0800 Subject: [PATCH 31/34] rcu-tasks: Initialize data to eliminate RCU-tasks/do_exit() deadlocks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore initializes the data structures that will be needed to rely on these quiescent states and to eliminate these deadlocks. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Reviewed-by: Frederic Weisbecker Signed-off-by: Boqun Feng --- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/rcu/tasks.h | 2 ++ 3 files changed, 4 insertions(+) diff --git a/init/init_task.c b/init/init_task.c index 7ecb458eb3da..4daee6d761c8 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -147,6 +147,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .rcu_tasks_holdout = false, .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, + .rcu_tasks_exit_list = LIST_HEAD_INIT(init_task.rcu_tasks_exit_list), #endif #ifdef CONFIG_TASKS_TRACE_RCU .trc_reader_nesting = 0, diff --git a/kernel/fork.c b/kernel/fork.c index 0d944e92a43f..af7203be1d2d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1976,6 +1976,7 @@ static inline void rcu_copy_process(struct task_struct *p) p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; + INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 6961a1b5b783..edd14fee48c5 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -274,6 +274,8 @@ static void cblist_init_generic(struct rcu_tasks *rtp) rtpcp->rtpp = rtp; if (!rtpcp->rtp_blkd_tasks.next) INIT_LIST_HEAD(&rtpcp->rtp_blkd_tasks); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); } pr_info("%s: Setting shift to %d and lim to %d rcu_task_cb_adjust=%d.\n", rtp->name, From 6b70399f9ef3809f6e308fd99dd78b072c1bd05c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Feb 2024 11:28:45 -0800 Subject: [PATCH 32/34] rcu-tasks: Maintain lists to eliminate RCU-tasks/do_exit() deadlocks This commit continues the elimination of deadlocks involving do_exit() and RCU tasks by causing exit_tasks_rcu_start() to add the current task to a per-CPU list and causing exit_tasks_rcu_stop() to remove the current task from whatever list it is on. 
These lists will be used to track tasks that are exiting, while still accounting for any RCU-tasks quiescent states that these tasks pass though. [ paulmck: Apply Frederic Weisbecker feedback. ] Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Reviewed-by: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index edd14fee48c5..9e4122497b9f 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1147,25 +1147,48 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Protect against tasklist scan blind spot while the task is exiting and + * may be removed from the tasklist. Do this by adding the task to yet + * another list. + * + * Note that the task will remove itself from this list, so there is no + * need for get_task_struct(), except in the case where rcu_tasks_pertask() + * adds it to the holdout list, in which case rcu_tasks_pertask() supplies + * the needed get_task_struct(). */ -void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_start(void) { - current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu); + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + struct task_struct *t = current; + + WARN_ON_ONCE(!list_empty(&t->rcu_tasks_exit_list)); + preempt_disable(); + rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); + t->rcu_tasks_exit_cpu = smp_processor_id(); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (!rtpcp->rtp_exit_list.next) + INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + preempt_enable(); } /* - * Contribute to protect against tasklist scan blind spot while the - * task is exiting and may be removed from the tasklist. See - * corresponding synchronize_srcu() for further details. + * Remove the task from the "yet another list" because do_exit() is now + * non-preemptible, allowing synchronize_rcu() to wait beyond this point. */ -void exit_tasks_rcu_stop(void) __releases(&tasks_rcu_exit_srcu) +void exit_tasks_rcu_stop(void) { + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; struct task_struct *t = current; - __srcu_read_unlock(&tasks_rcu_exit_srcu, t->rcu_tasks_idx); + WARN_ON_ONCE(list_empty(&t->rcu_tasks_exit_list)); + rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, t->rcu_tasks_exit_cpu); + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + list_del_init(&t->rcu_tasks_exit_list); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); } /* From 1612160b91272f5b1596f499584d6064bf5be794 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 2 Feb 2024 11:49:06 -0800 Subject: [PATCH 33/34] rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks Holding a mutex across synchronize_rcu_tasks() and acquiring that same mutex in code called from do_exit() after its call to exit_tasks_rcu_start() but before its call to exit_tasks_rcu_stop() results in deadlock. 
This is by design, because tasks that are far enough into do_exit() are no longer present on the tasks list, making it a bit difficult for RCU Tasks to find them, let alone wait on them to do a voluntary context switch. However, such deadlocks are becoming more frequent. In addition, lockdep currently does not detect such deadlocks and they can be difficult to reproduce. In addition, if a task voluntarily context switches during that time (for example, if it blocks acquiring a mutex), then this task is in an RCU Tasks quiescent state. And with some adjustments, RCU Tasks could just as well take advantage of that fact. This commit therefore eliminates these deadlock by replacing the SRCU-based wait for do_exit() completion with per-CPU lists of tasks currently exiting. A given task will be on one of these per-CPU lists for the same period of time that this task would previously have been in the previous SRCU read-side critical section. These lists enable RCU Tasks to find the tasks that have already been removed from the tasks list, but that must nevertheless be waited upon. The RCU Tasks grace period gathers any of these do_exit() tasks that it must wait on, and adds them to the list of holdouts. Per-CPU locking and get_task_struct() are used to synchronize addition to and removal from these lists. Link: https://lore.kernel.org/all/20240118021842.290665-1-chenzhongjin@huawei.com/ Reported-by: Chen Zhongjin Reported-by: Yang Jihong Signed-off-by: Paul E. McKenney Tested-by: Yang Jihong Tested-by: Chen Zhongjin Reviewed-by: Frederic Weisbecker Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 9e4122497b9f..c61dc92537db 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -146,8 +146,6 @@ static struct rcu_tasks rt_name = \ } #ifdef CONFIG_TASKS_RCU -/* Track exiting tasks in order to allow them to be waited for. */ -DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu); /* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); @@ -852,10 +850,12 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // number of voluntary context switches, and add that task to the // holdout list. // rcu_tasks_postscan(): -// Invoke synchronize_srcu() to ensure that all tasks that were -// in the process of exiting (and which thus might not know to -// synchronize with this RCU Tasks grace period) have completed -// exiting. +// Gather per-CPU lists of tasks in do_exit() to ensure that all +// tasks that were in the process of exiting (and which thus might +// not know to synchronize with this RCU Tasks grace period) have +// completed exiting. The synchronize_rcu() in rcu_tasks_postgp() +// will take care of any tasks stuck in the non-preemptible region +// of do_exit() following its call to exit_tasks_rcu_stop(). // check_all_holdout_tasks(), repeatedly until holdout list is empty: // Scans the holdout list, attempting to identify a quiescent state // for each task on the list. If there is a quiescent state, the @@ -868,8 +868,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // with interrupts disabled. // // For each exiting task, the exit_tasks_rcu_start() and -// exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU -// read-side critical sections waited for by rcu_tasks_postscan(). 
+// exit_tasks_rcu_finish() functions add and remove, respectively, the +// current task to a per-CPU list of tasks that rcu_tasks_postscan() must +// wait on. This is necessary because rcu_tasks_postscan() must wait on +// tasks that have already been removed from the global list of tasks. // // Pre-grace-period update-side code is ordered before the grace // via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code @@ -933,9 +935,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop) } } +void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); +DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); + /* Processing between scanning taskslist and draining the holdout list. */ static void rcu_tasks_postscan(struct list_head *hop) { + int cpu; int rtsi = READ_ONCE(rcu_task_stall_info); if (!IS_ENABLED(CONFIG_TINY_RCU)) { @@ -949,9 +955,9 @@ static void rcu_tasks_postscan(struct list_head *hop) * this, divide the fragile exit path part in two intersecting * read side critical sections: * - * 1) An _SRCU_ read side starting before calling exit_notify(), - * which may remove the task from the tasklist, and ending after - * the final preempt_disable() call in do_exit(). + * 1) A task_struct list addition before calling exit_notify(), + * which may remove the task from the tasklist, with the + * removal after the final preempt_disable() call in do_exit(). * * 2) An _RCU_ read side starting with the final preempt_disable() * call in do_exit() and ending with the final call to schedule() @@ -960,7 +966,17 @@ static void rcu_tasks_postscan(struct list_head *hop) * This handles the part 1). And postgp will handle part 2) with a * call to synchronize_rcu(). */ - synchronize_srcu(&tasks_rcu_exit_srcu); + + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); + struct task_struct *t; + + raw_spin_lock_irq_rcu_node(rtpcp); + list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) + if (list_empty(&t->rcu_tasks_holdout_list)) + rcu_tasks_pertask(t, hop); + raw_spin_unlock_irq_rcu_node(rtpcp); + } if (!IS_ENABLED(CONFIG_TINY_RCU)) del_timer_sync(&tasks_rcu_exit_srcu_stall_timer); @@ -1028,7 +1044,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) * * In addition, this synchronize_rcu() waits for exiting tasks * to complete their final preempt_disable() region of execution, - * cleaning up after synchronize_srcu(&tasks_rcu_exit_srcu), * enforcing the whole region before tasklist removal until * the final schedule() with TASK_DEAD state to be an RCU TASKS * read side critical section. @@ -1036,9 +1051,6 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp) synchronize_rcu(); } -void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func); -DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks"); - static void tasks_rcu_exit_srcu_stall(struct timer_list *unused) { #ifndef CONFIG_TINY_RCU From 0bb11a372fc8d7006b4d0f42a2882939747bdbff Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 1 Feb 2024 06:10:26 -0800 Subject: [PATCH 34/34] rcu-tasks: Maintain real-time response in rcu_tasks_postscan() The current code will scan the entirety of each per-CPU list of exiting tasks in ->rtp_exit_list with interrupts disabled. This is normally just fine, because each CPU typically won't have very many tasks in this state. However, if a large number of tasks block late in do_exit(), these lists could be arbitrarily long. 
Low probability, perhaps, but it really could happen. This commit therefore occasionally re-enables interrupts while traversing these lists, inserting a dummy element to hold the current place in the list. In kernels built with CONFIG_PREEMPT_RT=y, this re-enabling happens after each list element is processed, otherwise every one-to-two jiffies. [ paulmck: Apply Frederic Weisbecker feedback. ] Link: https://lore.kernel.org/all/ZdeI_-RfdLR8jlsm@localhost.localdomain/ Signed-off-by: Paul E. McKenney Cc: Thomas Gleixner Cc: Sebastian Siewior Cc: Anna-Maria Behnsen Cc: Steven Rostedt Signed-off-by: Boqun Feng --- kernel/rcu/tasks.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index c61dc92537db..147b5945d67a 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -968,13 +968,33 @@ static void rcu_tasks_postscan(struct list_head *hop) */ for_each_possible_cpu(cpu) { + unsigned long j = jiffies + 1; struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rcu_tasks.rtpcpu, cpu); struct task_struct *t; + struct task_struct *t1; + struct list_head tmp; raw_spin_lock_irq_rcu_node(rtpcp); - list_for_each_entry(t, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) + list_for_each_entry_safe(t, t1, &rtpcp->rtp_exit_list, rcu_tasks_exit_list) { if (list_empty(&t->rcu_tasks_holdout_list)) rcu_tasks_pertask(t, hop); + + // RT kernels need frequent pauses, otherwise + // pause at least once per pair of jiffies. + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && time_before(jiffies, j)) + continue; + + // Keep our place in the list while pausing. + // Nothing else traverses this list, so adding a + // bare list_head is OK. + list_add(&tmp, &t->rcu_tasks_exit_list); + raw_spin_unlock_irq_rcu_node(rtpcp); + cond_resched(); // For CONFIG_PREEMPT=n kernels + raw_spin_lock_irq_rcu_node(rtpcp); + t1 = list_entry(tmp.next, struct task_struct, rcu_tasks_exit_list); + list_del(&tmp); + j = jiffies + 1; + } raw_spin_unlock_irq_rcu_node(rtpcp); }
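
For readers new to the list-cursor pattern used in the final hunk above: the traversal keeps its place across a lock drop by splicing a dummy list element in right after the element just processed, releasing the lock, and then resuming from whatever follows the dummy once the lock is reacquired. What follows is a minimal userspace analogue, not kernel code: it assumes a hand-rolled doubly-linked list and a pthread mutex in place of the kernel's list_head and raw_spin_lock_irq_rcu_node(), and a visit() stub in place of rcu_tasks_pertask().

/*
 * Userspace sketch of the "cursor element" traversal pattern.
 * Assumptions (not from the patch): plain doubly-linked list and a
 * pthread mutex stand in for list_head and the per-CPU raw spinlock;
 * visit() stands in for rcu_tasks_pertask().  Build: cc -pthread cursor.c
 */
#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *prev, *next;
	int payload;			/* -1 marks the cursor (dummy) element */
};

static struct node head = { &head, &head, -1 };	/* circular sentinel */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void insert_after(struct node *pos, struct node *n)
{
	n->prev = pos;
	n->next = pos->next;
	pos->next->prev = n;
	pos->next = n;
}

static void remove_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->prev = n->next = n;
}

static void visit(struct node *n)
{
	printf("visiting %d\n", n->payload);
}

/* Walk the list, dropping the lock after each element while keeping our place. */
static void scan_with_cursor(void)
{
	struct node cursor = { &cursor, &cursor, -1 };
	struct node *n, *next;

	pthread_mutex_lock(&lock);
	for (n = head.next; n != &head; n = next) {
		if (n->payload >= 0)
			visit(n);

		/* Hold our place with the cursor, then briefly release the lock. */
		insert_after(n, &cursor);
		pthread_mutex_unlock(&lock);
		/* ... other threads may add or remove elements here ... */
		pthread_mutex_lock(&lock);
		next = cursor.next;
		remove_node(&cursor);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	struct node nodes[3];
	int i;

	for (i = 0; i < 3; i++) {
		nodes[i].payload = i;
		pthread_mutex_lock(&lock);
		insert_after(&head, &nodes[i]);
		pthread_mutex_unlock(&lock);
	}
	scan_with_cursor();
	return 0;
}

Because the dummy element is the only thing the traversal leaves in the list while the lock is dropped, concurrent threads are free to add or remove real elements during the pause; the scan simply resumes from whatever follows the cursor once the lock is reacquired, which is the property the rcu_tasks_postscan() change above relies on to bound interrupt-disabled time.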