From 962aff03c315b508d980422db5b49b49e4382119 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 2 May 2018 12:49:21 -0700
Subject: [PATCH] rcu: Clean up handling of tasks blocked across full-rcu_node
 offline

Commit 0aa04b055e71 ("rcu: Process offlining and onlining only at
grace-period start") deferred handling of CPU-hotplug events until the
start of the next grace period, but consider the following sequence
of events:

1.	A task is preempted within an RCU-preempt read-side critical
	section.

2.	The CPU that this task was running on goes offline, along with all
	other CPUs sharing the corresponding leaf rcu_node structure.

3.	The task resumes execution.

4.	One of those CPUs comes back online before a new grace period starts.

In step 2, the code in the next rcu_gp_init() invocation will (correctly)
defer removing the leaf rcu_node structure from the upper-level bitmasks,
and will (correctly) set that structure's ->wait_blkd_tasks field.  During
the ensuing interval, RCU will (correctly) track the tasks preempted on
that structure because they must block any subsequent grace period.

In step 3, the code in rcu_read_unlock_special() will (correctly) remove
the task from the leaf rcu_node structure.  From this point forward, RCU
need not pay attention to this structure, at least not until one of the
corresponding CPUs comes back online.

In step 4, the code in the next rcu_gp_init() invocation will
(incorrectly) invoke rcu_init_new_rnp().  This is incorrect because
the corresponding rcu_cleanup_dead_rnp() was never invoked.  This is
nevertheless harmless because the upper-level bits are still set.
So, no harm, no foul, right?

At least, all is well until a little further into rcu_gp_init()
invocation, which will notice that there are no longer any tasks blocked
on the leaf rcu_node structure, conclude that there is no longer anything
left over from step 2's offline operation, and will therefore invoke
rcu_cleanup_dead_rnp().  But this invocation of rcu_cleanup_dead_rnp()
is for the beginning of the earlier offline interval, and the previous
invocation of rcu_init_new_rnp() is for the end of that same interval.
That is right, they are invoked out of order.

That cannot be good, can it?

It turns out that this is not a (correctness!) problem because
rcu_cleanup_dead_rnp() checks to see if any of the corresponding CPUs
are online, and refuses to do anything if so.  In other words, in the
case where rcu_init_new_rnp() and rcu_cleanup_dead_rnp() execute out of
order, they both have no effect.

But this is at best an accident waiting to happen.

This commit therefore adds logic to rcu_gp_init() so that
rcu_init_new_rnp() and rcu_cleanup_dead_rnp() are always invoked in
order, and so that neither are invoked at all in cases where RCU had to
pay attention to the leaf rcu_node structure during the entire time that
all corresponding CPUs were offline.

And, while in the area, this commit reduces confusion by using formal
parameters rather than local variables that just happen to have the same
value at that particular point in the code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a6863b813f0c..9a5ba6db7b60 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1909,12 +1909,14 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 
 		/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
 		if (!oldmask != !rnp->qsmaskinit) {
-			if (!oldmask) /* First online CPU for this rcu_node. */
-				rcu_init_new_rnp(rnp);
-			else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
-				rnp->wait_blkd_tasks = true;
-			else /* Last offline CPU and can propagate. */
+			if (!oldmask) { /* First online CPU for rcu_node. */
+				if (!rnp->wait_blkd_tasks) /* Ever offline? */
+					rcu_init_new_rnp(rnp);
+			} else if (rcu_preempt_has_tasks(rnp)) {
+				rnp->wait_blkd_tasks = true; /* blocked tasks */
+			} else { /* Last offline CPU and can propagate. */
 				rcu_cleanup_dead_rnp(rnp);
+			}
 		}
 
 		/*
@@ -1923,14 +1925,13 @@ static bool rcu_gp_init(struct rcu_state *rsp)
 		 * still offline, propagate up the rcu_node tree and
 		 * clear ->wait_blkd_tasks.  Otherwise, if one of this
 		 * rcu_node structure's CPUs has since come back online,
-		 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
-		 * checks for this, so just call it unconditionally).
+		 * simply clear ->wait_blkd_tasks.
 		 */
 		if (rnp->wait_blkd_tasks &&
-		    (!rcu_preempt_has_tasks(rnp) ||
-		     rnp->qsmaskinit)) {
+		    (!rcu_preempt_has_tasks(rnp) || rnp->qsmaskinit)) {
 			rnp->wait_blkd_tasks = false;
-			rcu_cleanup_dead_rnp(rnp);
+			if (!rnp->qsmaskinit)
+				rcu_cleanup_dead_rnp(rnp);
 		}
 
 		raw_spin_unlock_irq_rcu_node(rnp);
@@ -2450,9 +2451,10 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	long mask;
 	struct rcu_node *rnp = rnp_leaf;
 
-	raw_lockdep_assert_held_rcu_node(rnp);
+	raw_lockdep_assert_held_rcu_node(rnp_leaf);
 	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
-	    rnp->qsmaskinit || rcu_preempt_has_tasks(rnp))
+	    WARN_ON_ONCE(rnp_leaf->qsmaskinit) ||
+	    WARN_ON_ONCE(rcu_preempt_has_tasks(rnp_leaf)))
 		return;
 	for (;;) {
 		mask = rnp->grpmask;
@@ -2461,7 +2463,8 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 			break;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
 		rnp->qsmaskinit &= ~mask;
-		rnp->qsmask &= ~mask;
+		/* Between grace periods, so better already be zero! */
+		WARN_ON_ONCE(rnp->qsmask);
 		if (rnp->qsmaskinit) {
 			raw_spin_unlock_rcu_node(rnp);
 			/* irqs remain disabled. */
@@ -3479,6 +3482,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 	struct rcu_node *rnp = rnp_leaf;
 
 	raw_lockdep_assert_held_rcu_node(rnp);
+	WARN_ON_ONCE(rnp->wait_blkd_tasks);
 	for (;;) {
 		mask = rnp->grpmask;
 		rnp = rnp->parent;