linux-stable/kernel/sched/pelt.h
Vincent Donnefort e2f3e35f1f sched/fair: Decay task PELT values during wakeup migration
Before being migrated to a new CPU, a task sees its PELT values
synchronized with rq last_update_time. Once done, that same task will also
have its sched_avg last_update_time reset. This means the time between
the migration and the last clock update will not be accounted for in
util_avg and a discontinuity will appear. This issue is amplified by the
PELT clock scaling. It takes currently one tick after the CPU being idle
to let clock_pelt catching up clock_task.

This is especially problematic for asymmetric CPU capacity systems which
need stable util_avg signals for task placement and energy estimation.

Ideally, this problem would be solved by updating the runqueue clocks
before the migration. But that would require taking the runqueue lock
which is quite expensive [1]. Instead estimate the missing time and update
the task util_avg with that value.

To that end, we need sched_clock_cpu() but it is a costly function. Limit
the usage to the case where the source CPU is idle as we know this is when
the clock is having the biggest risk of being outdated.

See comment in migrate_se_pelt_lag() for more details about how the PELT
value is estimated. Notice though this estimation doesn't take into account
IRQ and Paravirt time.

[1] https://lkml.kernel.org/r/20190709115759.10451-1-chris.redpath@arm.com

Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Signed-off-by: Vincent Donnefort <vdonnefort@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Lukasz Luba <lukasz.luba@arm.com>
Link: https://lkml.kernel.org/r/20220621090414.433602-3-vdonnefort@google.com
2022-06-28 09:17:46 +02:00

235 lines
6 KiB
C

#ifdef CONFIG_SMP
#include "sched-pelt.h"
int __update_load_avg_blocked_se(u64 now, struct sched_entity *se);
int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se);
int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
#ifdef CONFIG_SCHED_THERMAL_PRESSURE
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
static inline u64 thermal_load_avg(struct rq *rq)
{
return READ_ONCE(rq->avg_thermal.load_avg);
}
#else
static inline int
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
static inline u64 thermal_load_avg(struct rq *rq)
{
return 0;
}
#endif
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
#endif
#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
static inline u32 get_pelt_divider(struct sched_avg *avg)
{
return PELT_MIN_DIVIDER + avg->period_contrib;
}
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Avoid store if the flag has been already reset */
enqueued = avg->util_est.enqueued;
if (!(enqueued & UTIL_AVG_UNCHANGED))
return;
/* Reset flag to report util_avg has been updated */
enqueued &= ~UTIL_AVG_UNCHANGED;
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
static inline u64 rq_clock_pelt(struct rq *rq)
{
lockdep_assert_rq_held(rq);
assert_clock_updated(rq);
return rq->clock_pelt - rq->lost_idle_time;
}
/* The rq is idle, we can sync to clock_task */
static inline void _update_idle_rq_clock_pelt(struct rq *rq)
{
rq->clock_pelt = rq_clock_task(rq);
u64_u32_store(rq->clock_idle, rq_clock(rq));
/* Paired with smp_rmb in migrate_se_pelt_lag() */
smp_wmb();
u64_u32_store(rq->clock_pelt_idle, rq_clock_pelt(rq));
}
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
* clock_task when rq is idle.
*
*
* absolute time | 1| 2| 3| 4| 5| 6| 7| 8| 9|10|11|12|13|14|15|16
* @ max capacity ------******---------------******---------------
* @ half capacity ------************---------************---------
* clock pelt | 1| 2| 3| 4| 7| 8| 9| 10| 11|14|15|16
*
*/
static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
{
if (unlikely(is_idle_task(rq->curr))) {
_update_idle_rq_clock_pelt(rq);
return;
}
/*
* When a rq runs at a lower compute capacity, it will need
* more time to do the same amount of work than at max
* capacity. In order to be invariant, we scale the delta to
* reflect how much work has been really done.
* Running longer results in stealing idle time that will
* disturb the load signal compared to max capacity. This
* stolen idle time will be automatically reflected when the
* rq will be idle and the clock will be synced with
* rq_clock_task.
*/
/*
* Scale the elapsed time to reflect the real amount of
* computation
*/
delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
rq->clock_pelt += delta;
}
/*
* When rq becomes idle, we have to check if it has lost idle time
* because it was fully busy. A rq is fully used when the /Sum util_sum
* is greater or equal to:
* (LOAD_AVG_MAX - 1024 + rq->cfs.avg.period_contrib) << SCHED_CAPACITY_SHIFT;
* For optimization and computing rounding purpose, we don't take into account
* the position in the current window (period_contrib) and we use the higher
* bound of util_sum to decide.
*/
static inline void update_idle_rq_clock_pelt(struct rq *rq)
{
u32 divider = ((LOAD_AVG_MAX - 1024) << SCHED_CAPACITY_SHIFT) - LOAD_AVG_MAX;
u32 util_sum = rq->cfs.avg.util_sum;
util_sum += rq->avg_rt.util_sum;
util_sum += rq->avg_dl.util_sum;
/*
* Reflecting stolen time makes sense only if the idle
* phase would be present at max capacity. As soon as the
* utilization of a rq has reached the maximum value, it is
* considered as an always running rq without idle time to
* steal. This potential idle time is considered as lost in
* this case. We keep track of this lost idle time compare to
* rq's clock_task.
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
_update_idle_rq_clock_pelt(rq);
}
#ifdef CONFIG_CFS_BANDWIDTH
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
u64 throttled;
if (unlikely(cfs_rq->throttle_count))
throttled = U64_MAX;
else
throttled = cfs_rq->throttled_clock_pelt_time;
u64_u32_store(cfs_rq->throttled_pelt_idle, throttled);
}
/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
return rq_clock_pelt(rq_of(cfs_rq));
}
#endif
#else
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
static inline int
update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
static inline int
update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
static inline u64 thermal_load_avg(struct rq *rq)
{
return 0;
}
static inline int
update_irq_load_avg(struct rq *rq, u64 running)
{
return 0;
}
static inline u64 rq_clock_pelt(struct rq *rq)
{
return rq_clock_task(rq);
}
static inline void
update_rq_clock_pelt(struct rq *rq, s64 delta) { }
static inline void
update_idle_rq_clock_pelt(struct rq *rq) { }
static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { }
#endif