linux-stable/kernel/power/energy_model.c
Lukasz Luba 156eaacba3 PM: EM: Increase energy calculation precision
[ Upstream commit 7fcc17d0cb ]

The Energy Model (EM) provides useful information about device power in
each performance state to other subsystems like: Energy Aware Scheduler
(EAS). The energy calculation in EAS does arithmetic operation based on
the EM em_cpu_energy(). Current implementation of that function uses
em_perf_state::cost as a pre-computed cost coefficient equal to:
cost = power * max_frequency / frequency.
The 'power' is expressed in milli-Watts (or in abstract scale).

There are corner cases when the EAS energy calculation for two Performance
Domains (PDs) return the same value. The EAS compares these values to
choose smaller one. It might happen that this values are equal due to
rounding error. In such scenario, we need better resolution, e.g. 1000
times better. To provide this possibility increase the resolution in the
em_perf_state::cost for 64-bit architectures. The cost of increasing
resolution on 32-bit is pretty high (64-bit division) and is not justified
since there are no new 32bit big.LITTLE EAS systems expected which would
benefit from this higher resolution.

This patch allows to avoid the rounding to milli-Watt errors, which might
occur in EAS energy estimation for each PD. The rounding error is common
for small tasks which have small utilization value.

There are two places in the code where it makes a difference:
1. In the find_energy_efficient_cpu() where we are searching for
best_delta. We might suffer there when two PDs return the same result,
like in the example below.

Scenario:
Low utilized system e.g. ~200 sum_util for PD0 and ~220 for PD1. There
are quite a few small tasks ~10-15 util. These tasks would suffer for
the rounding error. These utilization values are typical when running games
on Android. One of our partners has reported 5..10mA less battery drain
when running with increased resolution.

Some details:
We have two PDs: PD0 (big) and PD1 (little)
Let's compare w/o patch set ('old') and w/ patch set ('new')
We are comparing energy w/ task and w/o task placed in the PDs

a) 'old' w/o patch set, PD0
task_util = 13
cost = 480
sum_util_w/o_task = 215
sum_util_w_task = 228
scale_cpu = 1024
energy_w/o_task = 480 * 215 / 1024 = 100.78 => 100
energy_w_task = 480 * 228 / 1024 = 106.87 => 106
energy_diff = 106 - 100 = 6
(this is equal to 'old' PD1's energy_diff in 'c)')

b) 'new' w/ patch set, PD0
task_util = 13
cost = 480 * 1000 = 480000
sum_util_w/o_task = 215
sum_util_w_task = 228
energy_w/o_task = 480000 * 215 / 1024 = 100781
energy_w_task = 480000 * 228 / 1024  = 106875
energy_diff = 106875 - 100781 = 6094
(this is not equal to 'new' PD1's energy_diff in 'd)')

c) 'old' w/o patch set, PD1
task_util = 13
cost = 160
sum_util_w/o_task = 283
sum_util_w_task = 293
scale_cpu = 355
energy_w/o_task = 160 * 283 / 355 = 127.55 => 127
energy_w_task = 160 * 296 / 355 = 133.41 => 133
energy_diff = 133 - 127 = 6
(this is equal to 'old' PD0's energy_diff in 'a)')

d) 'new' w/ patch set, PD1
task_util = 13
cost = 160 * 1000 = 160000
sum_util_w/o_task = 283
sum_util_w_task = 293
scale_cpu = 355
energy_w/o_task = 160000 * 283 / 355 = 127549
energy_w_task = 160000 * 296 / 355 =   133408
energy_diff = 133408 - 127549 = 5859
(this is not equal to 'new' PD0's energy_diff in 'b)')

2. Difference in the 6% energy margin filter at the end of
find_energy_efficient_cpu(). With this patch the margin comparison also
has better resolution, so it's possible to have better task placement
thanks to that.

Fixes: 27871f7a8a ("PM: Introduce an Energy Model management framework")
Reported-by: CCJ Yeh <CCj.Yeh@mediatek.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Lukasz Luba <lukasz.luba@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2021-09-15 09:47:33 +02:00

260 lines
6.6 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Energy Model of CPUs
*
* Copyright (c) 2018, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
*/
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
#include <linux/sched/topology.h>
#include <linux/slab.h>
/* Mapping of each CPU to the performance domain to which it belongs. */
static DEFINE_PER_CPU(struct em_perf_domain *, em_data);
/*
* Mutex serializing the registrations of performance domains and letting
* callbacks defined by drivers sleep.
*/
static DEFINE_MUTEX(em_pd_mutex);
#ifdef CONFIG_DEBUG_FS
static struct dentry *rootdir;
static void em_debug_create_cs(struct em_cap_state *cs, struct dentry *pd)
{
struct dentry *d;
char name[24];
snprintf(name, sizeof(name), "cs:%lu", cs->frequency);
/* Create per-cs directory */
d = debugfs_create_dir(name, pd);
debugfs_create_ulong("frequency", 0444, d, &cs->frequency);
debugfs_create_ulong("power", 0444, d, &cs->power);
debugfs_create_ulong("cost", 0444, d, &cs->cost);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
{
seq_printf(s, "%*pbl\n", cpumask_pr_args(to_cpumask(s->private)));
return 0;
}
DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
static void em_debug_create_pd(struct em_perf_domain *pd, int cpu)
{
struct dentry *d;
char name[8];
int i;
snprintf(name, sizeof(name), "pd%d", cpu);
/* Create the directory of the performance domain */
d = debugfs_create_dir(name, rootdir);
debugfs_create_file("cpus", 0444, d, pd->cpus, &em_debug_cpus_fops);
/* Create a sub-directory for each capacity state */
for (i = 0; i < pd->nr_cap_states; i++)
em_debug_create_cs(&pd->table[i], d);
}
static int __init em_debug_init(void)
{
/* Create /sys/kernel/debug/energy_model directory */
rootdir = debugfs_create_dir("energy_model", NULL);
return 0;
}
fs_initcall(em_debug_init);
#else /* CONFIG_DEBUG_FS */
static void em_debug_create_pd(struct em_perf_domain *pd, int cpu) {}
#endif
static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states,
struct em_data_callback *cb)
{
unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
unsigned long power, freq, prev_freq = 0;
int i, ret, cpu = cpumask_first(span);
struct em_cap_state *table;
struct em_perf_domain *pd;
u64 fmax;
if (!cb->active_power)
return NULL;
pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL);
if (!pd)
return NULL;
table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
goto free_pd;
/* Build the list of capacity states for this performance domain */
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
* lowest capacity state of 'cpu' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
ret = cb->active_power(&power, &freq, cpu);
if (ret) {
pr_err("pd%d: invalid cap. state: %d\n", cpu, ret);
goto free_cs_table;
}
/*
* We expect the driver callback to increase the frequency for
* higher capacity states.
*/
if (freq <= prev_freq) {
pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq);
goto free_cs_table;
}
/*
* The power returned by active_state() is expected to be
* positive, in milli-watts and to fit into 16 bits.
*/
if (!power || power > EM_CPU_MAX_POWER) {
pr_err("pd%d: invalid power: %lu\n", cpu, power);
goto free_cs_table;
}
table[i].power = power;
table[i].frequency = prev_freq = freq;
/*
* The hertz/watts efficiency ratio should decrease as the
* frequency grows on sane platforms. But this isn't always
* true in practice so warn the user if a higher OPP is more
* power efficient than a lower one.
*/
opp_eff = freq / power;
if (opp_eff >= prev_opp_eff)
pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n",
cpu, i, i - 1);
prev_opp_eff = opp_eff;
}
/* Compute the cost of each capacity_state. */
fmax = (u64) table[nr_states - 1].frequency;
for (i = 0; i < nr_states; i++) {
unsigned long power_res = em_scale_power(table[i].power);
table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
}
pd->table = table;
pd->nr_cap_states = nr_states;
cpumask_copy(to_cpumask(pd->cpus), span);
em_debug_create_pd(pd, cpu);
return pd;
free_cs_table:
kfree(table);
free_pd:
kfree(pd);
return NULL;
}
/**
* em_cpu_get() - Return the performance domain for a CPU
* @cpu : CPU to find the performance domain for
*
* Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't
* exist.
*/
struct em_perf_domain *em_cpu_get(int cpu)
{
return READ_ONCE(per_cpu(em_data, cpu));
}
EXPORT_SYMBOL_GPL(em_cpu_get);
/**
* em_register_perf_domain() - Register the Energy Model of a performance domain
* @span : Mask of CPUs in the performance domain
* @nr_states : Number of capacity states to register
* @cb : Callback functions providing the data of the Energy Model
*
* Create Energy Model tables for a performance domain using the callbacks
* defined in cb.
*
* If multiple clients register the same performance domain, all but the first
* registration will be ignored.
*
* Return 0 on success
*/
int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
struct em_data_callback *cb)
{
unsigned long cap, prev_cap = 0;
struct em_perf_domain *pd;
int cpu, ret = 0;
if (!span || !nr_states || !cb)
return -EINVAL;
/*
* Use a mutex to serialize the registration of performance domains and
* let the driver-defined callback functions sleep.
*/
mutex_lock(&em_pd_mutex);
for_each_cpu(cpu, span) {
/* Make sure we don't register again an existing domain. */
if (READ_ONCE(per_cpu(em_data, cpu))) {
ret = -EEXIST;
goto unlock;
}
/*
* All CPUs of a domain must have the same micro-architecture
* since they all share the same table.
*/
cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
pr_err("CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(span));
ret = -EINVAL;
goto unlock;
}
prev_cap = cap;
}
/* Create the performance domain and add it to the Energy Model. */
pd = em_create_pd(span, nr_states, cb);
if (!pd) {
ret = -EINVAL;
goto unlock;
}
for_each_cpu(cpu, span) {
/*
* The per-cpu array can be read concurrently from em_cpu_get().
* The barrier enforces the ordering needed to make sure readers
* can only access well formed em_perf_domain structs.
*/
smp_store_release(per_cpu_ptr(&em_data, cpu), pd);
}
pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span));
unlock:
mutex_unlock(&em_pd_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(em_register_perf_domain);