A couple of scheduler fixes:

- Make the conditional update of the overutilized state work correctly by
    caching the relevant flags state before overwriting them and checking
    them afterwards.
 
  - Fix a data race in the wakeup path which caused loadavg on ARM64
    platforms to become a random number generator.
 
  - Fix the ordering of the iowaiter accounting operations so it can't be
    decremented before it is incremented.
 
  - Fix a bug in the deadline scheduler vs. priority inheritance when a
    non-deadline task A has inherited the parameters of a deadline task B
    and then blocks on a non-deadline task C.
 
    The second inheritance step used the static deadline parameters of task
    A, which are usually 0, instead of further propagating task B's
    parameters. The zero initialized parameters trigger a bug in the
    deadline scheduler.
 -----BEGIN PGP SIGNATURE-----
 
 iQJHBAABCgAxFiEEQp8+kY+LLUocC4bMphj1TA10mKEFAl+6edsTHHRnbHhAbGlu
 dXRyb25peC5kZQAKCRCmGPVMDXSYoaJCEAC7VGr9IlWRzCI/173tKAXkLRrGXHVb
 yOYc/YjLMCTcERNxqpf8uIURd/ATSHU/RMwfFcB558NedKZ/QKZDoKmLqeCXnVeM
 e20tXv/fmpqRS7lgtmbBfhQ8mSDhst960oD1mHifdEwEBCCm7mLEaipTuTWjnZ0x
 rOz70Hir1mSjsP0E7ZorsxCr1yExbrt+jZfKCe9D2kUSvlWHf1ipzAYNlqb/DsfG
 n81G7q9LYV8NUhX3lt8oSZDq0K44aO6G6fEaP4EkfwsIAOh37yPHwuEuqDZCBmXw
 rQ17XUU3jQ2MtubPvVEKG/6Z+hAUyOsAKynpq/RhzueXQm/9Ns6+qHX/xY8yh39y
 S5qPd5DLRlac8f7cFwz2zPxP5E+xTJLONgRkuN1XlitMJZBxru9AzDNa0/6on8TM
 OtvbvVR+bPUfHiHULk4fTz7fLcbgYgxbCgfGoFsVlfskOxnzgEG8WfuI2Up2rRJ0
 nr1MCER+5fprciqPPs+18rVEFiC4mQSrV01cnwrNbpW8pqibZSomMilQ0oQvcTGL
 VDEHkaDTa5YbR92Szq4rYbr7Sf0ihFU0EZUNVQnu7SujdVFxTdHb1yr8UYcYp09b
 LqGFhr1FHBNYKbw3rEPx2R/FGuCii21oQkhz94ujDo1Np8EGVZYwFGh+iwbsa2Xn
 K1u0HzqLTfTkMw==
 =HiGq
 -----END PGP SIGNATURE-----

Merge tag 'sched-urgent-2020-11-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
 "A couple of scheduler fixes:

   - Make the conditional update of the overutilized state work
     correctly by caching the relevant flags state before overwriting
     them and checking them afterwards.

   - Fix a data race in the wakeup path which caused loadavg on ARM64
     platforms to become a random number generator.

   - Fix the ordering of the iowaiter accounting operations so it can't
     be decremented before it is incremented.

   - Fix a bug in the deadline scheduler vs. priority inheritance when a
     non-deadline task A has inherited the parameters of a deadline task
     B and then blocks on a non-deadline task C.

     The second inheritance step used the static deadline parameters of
     task A, which are usually 0, instead of further propagating task
     B's parameters. The zero initialized parameters trigger a bug in
     the deadline scheduler"

* tag 'sched-urgent-2020-11-22' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Fix priority inheritance with multiple scheduling classes
  sched: Fix rq->nr_iowait ordering
  sched: Fix data-race in wakeup
  sched/fair: Fix overutilized update in enqueue_task_fair()
This commit is contained in:
Linus Torvalds 2020-11-22 13:26:07 -08:00
commit f4b936f5d6
4 changed files with 95 additions and 57 deletions

View File

@ -552,7 +552,6 @@ struct sched_dl_entity {
* overruns.
*/
unsigned int dl_throttled : 1;
unsigned int dl_boosted : 1;
unsigned int dl_yielded : 1;
unsigned int dl_non_contending : 1;
unsigned int dl_overrun : 1;
@ -571,6 +570,15 @@ struct sched_dl_entity {
* time.
*/
struct hrtimer inactive_timer;
#ifdef CONFIG_RT_MUTEXES
/*
* Priority Inheritance. When a DEADLINE scheduling entity is boosted
* pi_se points to the donor, otherwise points to the dl_se it belongs
* to (the original one/itself).
*/
struct sched_dl_entity *pi_se;
#endif
};
#ifdef CONFIG_UCLAMP_TASK
@ -770,7 +778,6 @@ struct task_struct {
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
unsigned sched_migrated:1;
unsigned sched_remote_wakeup:1;
#ifdef CONFIG_PSI
unsigned sched_psi_wake_requeue:1;
#endif
@ -780,6 +787,21 @@ struct task_struct {
/* Unserialized, strictly 'current' */
/*
* This field must not be in the scheduler word above due to wakelist
* queueing no longer being serialized by p->on_cpu. However:
*
* p->XXX = X; ttwu()
* schedule() if (p->on_rq && ..) // false
* smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true
* deactivate_task() ttwu_queue_wakelist())
* p->on_rq = 0; p->sched_remote_wakeup = Y;
*
* guarantees all stores of 'current' are visible before
* ->sched_remote_wakeup gets used, so it can be in this word.
*/
unsigned sched_remote_wakeup:1;
/* Bit to tell LSMs we're in execve(): */
unsigned in_execve:1;
unsigned in_iowait:1;

View File

@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
else
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
@ -4907,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
} else {
p->dl.pi_se = &p->dl;
}
p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
p->sched_class = &fair_sched_class;

View File

@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
#ifdef CONFIG_RT_MUTEXES
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se->pi_se;
}
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return pi_of(dl_se) != dl_se;
}
#else
static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
{
return dl_se;
}
static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
{
return false;
}
#endif
#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(dl_se->dl_boosted);
WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
static void replenish_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
BUG_ON(pi_se->dl_runtime <= 0);
BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
if (dl_se->dl_deadline == 0) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded && dl_se->runtime > 0)
@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
dl_se->deadline += pi_se->dl_period;
dl_se->runtime += pi_se->dl_runtime;
dl_se->deadline += pi_of(dl_se)->dl_period;
dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded)
@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
(pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule.
*/
static void update_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
static void update_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
!dl_se->dl_boosted)){
!is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
return;
}
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
}
@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
if (dl_se->dl_boosted)
if (is_dl_boosted(dl_se))
goto unlock;
/*
@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
replenish_dl_entity(dl_se, dl_se);
replenish_dl_entity(dl_se);
goto unlock;
}
@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@ -1287,7 +1306,7 @@ throttle:
dl_se->dl_overrun = 1;
__dequeue_task_dl(rq, curr, 0);
if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
}
static void
enqueue_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, int flags)
enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
BUG_ON(on_dl_rq(dl_se));
@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
*/
if (flags & ENQUEUE_WAKEUP) {
task_contending(dl_se, flags);
update_dl_entity(dl_se, pi_se);
update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
struct sched_dl_entity *pi_se = &p->dl;
/*
* Use the scheduling parameters of the top pi-waiter task if:
* - we have a top pi-waiter which is a SCHED_DEADLINE task AND
* - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
* smaller than our deadline OR we are a !SCHED_DEADLINE task getting
* boosted due to a SCHED_DEADLINE pi-waiter).
* Otherwise we keep our runtime and deadline.
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
if (is_dl_boosted(&p->dl)) {
/*
* Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread
@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* the throttle.
*/
p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
return;
}
@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
enqueue_dl_entity(&p->dl, pi_se, flags);
enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
#ifdef CONFIG_RT_MUTEXES
dl_se->pi_se = dl_se;
#endif
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)

View File

@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);
/*
* The code below (indirectly) updates schedutil which looks at
@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
if (flags & ENQUEUE_WAKEUP)
if (!task_new)
update_overutilized_status(rq);
enqueue_throttle: