diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 46d669297b1f..0ef7e0a67089 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6702,61 +6702,96 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) } /* - * compute_energy(): Estimates the energy that @pd would consume if @p was - * migrated to @dst_cpu. compute_energy() predicts what will be the utilization - * landscape of @pd's CPUs after the task migration, and uses the Energy Model - * to compute what would be the energy if we decided to actually migrate that - * task. + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the + * placement. Given by eenv_task_busy_time(). + * @pd_busy_time: Utilization of the whole perf domain without the task + * contribution. Given by eenv_pd_busy_time(). + * @cpu_cap: Maximum CPU capacity for the perf domain, minus thermal pressure. + * @pd_cap: Entire perf domain capacity. (online CPUs of the pd * cpu_cap). */ -static long -compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus, - struct perf_domain *pd) +struct energy_env { + unsigned long task_busy_time; + unsigned long pd_busy_time; + unsigned long cpu_cap; + unsigned long pd_cap; +}; + +/* + * Compute the task busy time for compute_energy(). This time cannot be + * injected directly into effective_cpu_util() because of the IRQ scaling. + * The latter only makes sense with the most recent CPU where the task has + * run; @prev_cpu is used here. The result is stored in @eenv->task_busy_time. 
+ */ +static inline void eenv_task_busy_time(struct energy_env *eenv, + struct task_struct *p, int prev_cpu) { - unsigned long max_util = 0, sum_util = 0, cpu_cap; + unsigned long busy_time, max_cap = arch_scale_cpu_capacity(prev_cpu); + unsigned long irq = cpu_util_irq(cpu_rq(prev_cpu)); + + if (unlikely(irq >= max_cap)) + busy_time = max_cap; + else + busy_time = scale_irq_capacity(task_util_est(p), irq, max_cap); + + eenv->task_busy_time = busy_time; +} + +/* + * Compute the perf_domain (PD) busy time for compute_energy(). It is based on + * the utilization of each CPU in @pd_cpus, but doesn't take clamping into + * account since the ratio (utilization / cpu_capacity) is already enough to + * scale the EM reported power consumption at the (possibly clamped) + * cpu_capacity. + * + * The contribution of the task @p for which we want to estimate the + * energy cost is removed (by cpu_util_next()) and must be calculated + * separately (see eenv_task_busy_time()). This ensures: + * + * - A stable PD utilization, no matter which CPU of that PD we want to place + * the task on. + * + * - A fair comparison between CPUs as the task contribution (task_util()) + * will always be the same no matter which CPU utilization we rely on + * (util_avg or util_est). + * + * Sets @eenv->pd_busy_time for the PD that spans @pd_cpus. This busy time can't + * exceed @eenv->pd_cap. + */ +static inline void eenv_pd_busy_time(struct energy_env *eenv, + struct cpumask *pd_cpus, + struct task_struct *p) +{ + unsigned long busy_time = 0; int cpu; - cpu_cap = arch_scale_cpu_capacity(cpumask_first(cpus)); - cpu_cap -= arch_scale_thermal_pressure(cpumask_first(cpus)); + for_each_cpu(cpu, pd_cpus) { + unsigned long util = cpu_util_next(cpu, p, -1); - /* - * The capacity state of CPUs of the current rd can be driven by CPUs - * of another rd if they belong to the same pd. So, account for the - * utilization of these CPUs too by masking pd with cpu_online_mask - * instead of the rd span. 
- * - * If an entire pd is outside of the current rd, it will not appear in - * its pd list and will not be accounted by compute_energy(). - */ - for_each_cpu(cpu, cpus) { - unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu); - unsigned long cpu_util, util_running = util_freq; - struct task_struct *tsk = NULL; + busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + } - /* - * When @p is placed on @cpu: - * - * util_running = max(cpu_util, cpu_util_est) + - * max(task_util, _task_util_est) - * - * while cpu_util_next is: max(cpu_util + task_util, - * cpu_util_est + _task_util_est) - */ - if (cpu == dst_cpu) { - tsk = p; - util_running = - cpu_util_next(cpu, p, -1) + task_util_est(p); - } + eenv->pd_busy_time = min(eenv->pd_cap, busy_time); +} - /* - * Busy time computation: utilization clamping is not - * required since the ratio (sum_util / cpu_capacity) - * is already enough to scale the EM reported power - * consumption at the (eventually clamped) cpu_capacity. - */ - cpu_util = effective_cpu_util(cpu, util_running, ENERGY_UTIL, - NULL); +/* + * Compute the maximum utilization for compute_energy() when the task @p + * is placed on the cpu @dst_cpu. + * + * Returns the maximum utilization among @pd_cpus. This utilization can't + * exceed @eenv->cpu_cap. + */ +static inline unsigned long +eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, + struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = 0; + int cpu; - sum_util += min(cpu_util, cpu_cap); + for_each_cpu(cpu, pd_cpus) { + struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; + unsigned long util = cpu_util_next(cpu, p, dst_cpu); + unsigned long cpu_util; /* * Performance domain frequency: utilization clamping @@ -6765,12 +6800,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct cpumask *cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. 
*/ - cpu_util = effective_cpu_util(cpu, util_freq, FREQUENCY_UTIL, - tsk); - max_util = max(max_util, min(cpu_util, cpu_cap)); + cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + max_util = max(max_util, cpu_util); } - return em_cpu_energy(pd->em_pd, max_util, sum_util, cpu_cap); + return min(max_util, eenv->cpu_cap); +} + +/* + * compute_energy(): Use the Energy Model to estimate the energy that @pd would + * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task + * contribution is ignored. + */ +static inline unsigned long +compute_energy(struct energy_env *eenv, struct perf_domain *pd, + struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) +{ + unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); + unsigned long busy_time = eenv->pd_busy_time; + + if (dst_cpu >= 0) + busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); + + return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); } /* @@ -6816,11 +6868,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; - struct root_domain *rd = cpu_rq(smp_processor_id())->rd; int cpu, best_energy_cpu = prev_cpu, target = -1; - unsigned long cpu_cap, util, base_energy = 0; + struct root_domain *rd = this_rq()->rd; + unsigned long base_energy = 0; struct sched_domain *sd; struct perf_domain *pd; + struct energy_env eenv; rcu_read_lock(); pd = rcu_dereference(rd->pd); @@ -6843,22 +6896,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!task_util_est(p)) goto unlock; + eenv_task_busy_time(&eenv, p, prev_cpu); + for (; pd; pd = pd->next) { - unsigned long cur_delta, spare_cap, max_spare_cap = 0; + unsigned long cpu_cap, cpu_thermal_cap, util; + unsigned long cur_delta, max_spare_cap = 0; bool compute_prev_delta = false; unsigned long base_energy_pd; int 
max_spare_cap_cpu = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); - for_each_cpu_and(cpu, cpus, sched_domain_span(sd)) { + if (cpumask_empty(cpus)) + continue; + + /* Account for thermal pressure in the energy estimation */ + cpu = cpumask_first(cpus); + cpu_thermal_cap = arch_scale_cpu_capacity(cpu); + cpu_thermal_cap -= arch_scale_thermal_pressure(cpu); + + eenv.cpu_cap = cpu_thermal_cap; + eenv.pd_cap = 0; + + for_each_cpu(cpu, cpus) { + eenv.pd_cap += cpu_thermal_cap; + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) + continue; + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) continue; util = cpu_util_next(cpu, p, cpu); cpu_cap = capacity_of(cpu); - spare_cap = cpu_cap; - lsub_positive(&spare_cap, util); /* * Skip CPUs that cannot satisfy the capacity request. @@ -6871,15 +6941,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!fits_capacity(util, cpu_cap)) continue; + lsub_positive(&cpu_cap, util); + if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ compute_prev_delta = true; - } else if (spare_cap > max_spare_cap) { + } else if (cpu_cap > max_spare_cap) { /* * Find the CPU with the maximum spare capacity * in the performance domain. */ - max_spare_cap = spare_cap; + max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; } } @@ -6887,13 +6959,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (max_spare_cap_cpu < 0 && !compute_prev_delta) continue; + eenv_pd_busy_time(&eenv, cpus, p); /* Compute the 'base' energy of the pd, without @p */ - base_energy_pd = compute_energy(p, -1, cpus, pd); + base_energy_pd = compute_energy(&eenv, pd, cpus, p, -1); base_energy += base_energy_pd; /* Evaluate the energy impact of using prev_cpu. 
*/ if (compute_prev_delta) { - prev_delta = compute_energy(p, prev_cpu, cpus, pd); + prev_delta = compute_energy(&eenv, pd, cpus, p, + prev_cpu); + /* CPU utilization has changed: energy with @p < base energy */ if (prev_delta < base_energy_pd) goto unlock; prev_delta -= base_energy_pd; @@ -6902,8 +6977,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0) { - cur_delta = compute_energy(p, max_spare_cap_cpu, cpus, - pd); + cur_delta = compute_energy(&eenv, pd, cpus, p, + max_spare_cap_cpu); + /* CPU utilization has changed: energy with @p < base energy */ if (cur_delta < base_energy_pd) goto unlock; cur_delta -= base_energy_pd;