linux-stable/kernel/sched/cpufreq_schedutil.c
Linus Torvalds b4ec805464 Power management updates for 5.11-rc1
- Use local_clock() instead of jiffies in the cpufreq statistics to
    improve accuracy (Viresh Kumar).
 
  - Fix up OPP usage in the cpufreq-dt and qcom-cpufreq-nvmem cpufreq
    drivers (Viresh Kumar).
 
  - Clean up the cpufreq core, the intel_pstate driver and the
    schedutil cpufreq governor (Rafael Wysocki).
 
  - Fix up error code paths in the sti-cpufreq and mediatek cpufreq
    drivers (Yangtao Li, Qinglang Miao).
 
  - Fix cpufreq_online() to return error codes instead of success (0)
    in all cases when it fails (Wang ShaoBo).
 
  - Add mt8167 support to the mediatek cpufreq driver and blacklist
    mt8516 in the cpufreq-dt-platdev driver (Fabien Parent).
 
  - Modify the tegra194 cpufreq driver to always return values from
    the frequency table as the current frequency and clean up that
    driver (Sumit Gupta, Jon Hunter).
 
  - Modify the arm_scmi cpufreq driver to allow it to discover the
    power scale present in the performance protocol and provide this
    information to the Energy Model (Lukasz Luba).
 
  - Add missing MODULE_DEVICE_TABLE to several cpufreq drivers (Pali
    Rohár).
 
  - Clean up the CPPC cpufreq driver (Ionela Voinescu).
 
  - Fix NVMEM_IMX_OCOTP dependency in the imx cpufreq driver (Arnd
    Bergmann).
 
  - Rework the poling interval selection for the polling state in
    cpuidle (Mel Gorman).
 
  - Enable suspend-to-idle for PSCI OSI mode in the PSCI cpuidle
    driver (Ulf Hansson).
 
  - Modify the OPP framework to support empty (node-less) OPP tables
    in DT for passing dependency information (Nicola Mazzucato).
 
  - Fix potential lockdep issue in the OPP core and clean up the OPP
    core (Viresh Kumar).
 
  - Modify dev_pm_opp_put_regulators() to accept a NULL argument and
    update its users accordingly (Viresh Kumar).
 
  - Add frequency changes tracepoint to devfreq (Matthias Kaehlcke).
 
  - Add support for governor feature flags to devfreq, make devfreq
    sysfs file permissions depend on the governor and clean up the
    devfreq core (Chanwoo Choi).
 
  - Clean up the tegra20 devfreq driver and deprecate it to allow
    another driver based on EMC_STAT to be used instead of it (Dmitry
    Osipenko).
 
  - Add interconnect support to the tegra30 devfreq driver, allow it
    to take the interconnect and OPP information from DT and clean it
    up ((Dmitry Osipenko).
 
  - Add interconnect support to the exynos-bus devfreq driver along
    with interconnect properties documentation (Sylwester Nawrocki).
 
  - Add suport for AMD Fam17h and Fam19h processors to the RAPL power
    capping driver (Victor Ding, Kim Phillips).
 
  - Fix handling of overly long constraint names in the powercap
    framework (Lukasz Luba).
 
  - Fix the wakeup configuration handling for bridges in the ACPI
    device power management core (Rafael Wysocki).
 
  - Add support for using an abstract scale for power units in the
    Energy Model (EM) and document it (Lukasz Luba).
 
  - Add em_cpu_energy() micro-optimization to the EM (Pavankumar
    Kondeti).
 
  - Modify the generic power domains (genpd) framwework to support
    suspend-to-idle (Ulf Hansson).
 
  - Fix creation of debugfs nodes in genpd (Thierry Strudel).
 
  - Clean up genpd (Lina Iyer).
 
  - Clean up the core system-wide suspend code and make it print
    driver flags for devices with debug enabled (Alex Shi, Patrice
    Chotard, Chen Yu).
 
  - Modify the ACPI system reboot code to make it prepare for system
    power off to avoid confusing the platform firmware (Kai-Heng Feng).
 
  - Update the pm-graph (multiple changes, mostly usability-related)
    and cpupower (online and offline CPU information support) PM
    utilities (Todd Brandt, Brahadambal Srinivasan).
 -----BEGIN PGP SIGNATURE-----
 
 iQJGBAABCAAwFiEE4fcc61cGeeHD/fCwgsRv/nhiVHEFAl/Y8mcSHHJqd0Byand5
 c29ja2kubmV0AAoJEILEb/54YlRxjY4QAKsNFJeEtjGCxq7MxQIML3QLAsdJM9of
 9kkY9skMEw4v1TRmyy7sW9jZW2pLSRcLJwWRKWu4143qUS3YUp2DQ0lqX4WyXoWu
 BhnkhkMUl6iCeBO8CWnt8zsTuqSa20A13sL9LyqN1+7OZKHD8StbT4hKjBncdNNN
 4aDj+1uAPyOgj2iCUZuHQ8DtpBvOLjgTh367vbhbufjeJ//8/9+R7s4Xzrj7wtmv
 JlE0LDgvge9QeGTpjhxQJzn0q2/H5fg9jbmjPXUfbHJNuyKhrqnmjGyrN5m256JI
 8DqGqQtJpmFp7Ihrur3uKTk3gWO05YwJ1FdeEooAKEjEMObm5xuYhKVRoDhmlJAu
 G6ui+OAUvNR0FffJtbzvWe/pLovLGOEOHdvTrZxUF8Abo6br3untTm8rKTi1fhaF
 wWndSMw0apGsPzCx5T+bE7AbJz2QHFpLhaVAutenuCzNI8xoMlxNKEzsaVz/+FqL
 Pq/PdFaM4vNlMbv7hkb/fujkCs/v3EcX2ihzvt7I2o8dBS0D1X8A4mnuWJmiGslw
 1ftbJ6M9XacwkPBTHPgeXxJh2C1yxxe5VQ9Z5fWWi7sPOUeJnUwxKaluv+coFndQ
 sO6JxsPQ4hQihg8yOxLEkL6Wn68sZlmp+u2Oj+TPFAsAGANIA8rJlBPo1ppJWvdQ
 j1OCIc/qzwpH
 =BVdX
 -----END PGP SIGNATURE-----

Merge tag 'pm-5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm

Pull power management updates from Rafael Wysocki:
 "These update cpufreq (core and drivers), cpuidle (polling state
  implementation and the PSCI driver), the OPP (operating performance
  points) framework, devfreq (core and drivers), the power capping RAPL
  (Running Average Power Limit) driver, the Energy Model support, the
  generic power domains (genpd) framework, the ACPI device power
  management, the core system-wide suspend code and power management
  utilities.

  Specifics:

   - Use local_clock() instead of jiffies in the cpufreq statistics to
     improve accuracy (Viresh Kumar).

   - Fix up OPP usage in the cpufreq-dt and qcom-cpufreq-nvmem cpufreq
     drivers (Viresh Kumar).

   - Clean up the cpufreq core, the intel_pstate driver and the
     schedutil cpufreq governor (Rafael Wysocki).

   - Fix up error code paths in the sti-cpufreq and mediatek cpufreq
     drivers (Yangtao Li, Qinglang Miao).

   - Fix cpufreq_online() to return error codes instead of success (0)
     in all cases when it fails (Wang ShaoBo).

   - Add mt8167 support to the mediatek cpufreq driver and blacklist
     mt8516 in the cpufreq-dt-platdev driver (Fabien Parent).

   - Modify the tegra194 cpufreq driver to always return values from the
     frequency table as the current frequency and clean up that driver
     (Sumit Gupta, Jon Hunter).

   - Modify the arm_scmi cpufreq driver to allow it to discover the
     power scale present in the performance protocol and provide this
     information to the Energy Model (Lukasz Luba).

   - Add missing MODULE_DEVICE_TABLE to several cpufreq drivers (Pali
     Rohár).

   - Clean up the CPPC cpufreq driver (Ionela Voinescu).

   - Fix NVMEM_IMX_OCOTP dependency in the imx cpufreq driver (Arnd
     Bergmann).

   - Rework the poling interval selection for the polling state in
     cpuidle (Mel Gorman).

   - Enable suspend-to-idle for PSCI OSI mode in the PSCI cpuidle driver
     (Ulf Hansson).

   - Modify the OPP framework to support empty (node-less) OPP tables in
     DT for passing dependency information (Nicola Mazzucato).

   - Fix potential lockdep issue in the OPP core and clean up the OPP
     core (Viresh Kumar).

   - Modify dev_pm_opp_put_regulators() to accept a NULL argument and
     update its users accordingly (Viresh Kumar).

   - Add frequency changes tracepoint to devfreq (Matthias Kaehlcke).

   - Add support for governor feature flags to devfreq, make devfreq
     sysfs file permissions depend on the governor and clean up the
     devfreq core (Chanwoo Choi).

   - Clean up the tegra20 devfreq driver and deprecate it to allow
     another driver based on EMC_STAT to be used instead of it (Dmitry
     Osipenko).

   - Add interconnect support to the tegra30 devfreq driver, allow it to
     take the interconnect and OPP information from DT and clean it up
     (Dmitry Osipenko).

   - Add interconnect support to the exynos-bus devfreq driver along
     with interconnect properties documentation (Sylwester Nawrocki).

   - Add suport for AMD Fam17h and Fam19h processors to the RAPL power
     capping driver (Victor Ding, Kim Phillips).

   - Fix handling of overly long constraint names in the powercap
     framework (Lukasz Luba).

   - Fix the wakeup configuration handling for bridges in the ACPI
     device power management core (Rafael Wysocki).

   - Add support for using an abstract scale for power units in the
     Energy Model (EM) and document it (Lukasz Luba).

   - Add em_cpu_energy() micro-optimization to the EM (Pavankumar
     Kondeti).

   - Modify the generic power domains (genpd) framwework to support
     suspend-to-idle (Ulf Hansson).

   - Fix creation of debugfs nodes in genpd (Thierry Strudel).

   - Clean up genpd (Lina Iyer).

   - Clean up the core system-wide suspend code and make it print driver
     flags for devices with debug enabled (Alex Shi, Patrice Chotard,
     Chen Yu).

   - Modify the ACPI system reboot code to make it prepare for system
     power off to avoid confusing the platform firmware (Kai-Heng Feng).

   - Update the pm-graph (multiple changes, mostly usability-related)
     and cpupower (online and offline CPU information support) PM
     utilities (Todd Brandt, Brahadambal Srinivasan)"

* tag 'pm-5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: (86 commits)
  cpufreq: Fix cpufreq_online() return value on errors
  cpufreq: Fix up several kerneldoc comments
  cpufreq: stats: Use local_clock() instead of jiffies
  cpufreq: schedutil: Simplify sugov_update_next_freq()
  cpufreq: intel_pstate: Simplify intel_cpufreq_update_pstate()
  PM: domains: create debugfs nodes when adding power domains
  opp: of: Allow empty opp-table with opp-shared
  dt-bindings: opp: Allow empty OPP tables
  media: venus: dev_pm_opp_put_*() accepts NULL argument
  drm/panfrost: dev_pm_opp_put_*() accepts NULL argument
  drm/lima: dev_pm_opp_put_*() accepts NULL argument
  PM / devfreq: exynos: dev_pm_opp_put_*() accepts NULL argument
  cpufreq: qcom-cpufreq-nvmem: dev_pm_opp_put_*() accepts NULL argument
  cpufreq: dt: dev_pm_opp_put_regulators() accepts NULL argument
  opp: Allow dev_pm_opp_put_*() APIs to accept NULL opp_table
  opp: Don't create an OPP table from dev_pm_opp_get_opp_table()
  cpufreq: dt: Don't (ab)use dev_pm_opp_get_opp_table() to create OPP table
  opp: Reduce the size of critical section in _opp_kref_release()
  PM / EM: Micro optimization in em_cpu_energy
  cpufreq: arm_scmi: Discover the power scale in performance protocol
  ...
2020-12-15 16:30:31 -08:00

923 lines
26 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* CPUFreq governor based on scheduler-provided CPU utilization data.
*
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "sched.h"
#include <linux/sched/cpufreq.h>
#include <trace/events/power.h>
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
struct gov_attr_set attr_set;
unsigned int rate_limit_us;
};
struct sugov_policy {
struct cpufreq_policy *policy;
struct sugov_tunables *tunables;
struct list_head tunables_hook;
raw_spinlock_t update_lock; /* For shared policies */
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work;
struct kthread_work work;
struct mutex work_lock;
struct kthread_worker worker;
struct task_struct *thread;
bool work_in_progress;
bool limits_changed;
bool need_freq_update;
};
struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
unsigned int cpu;
bool iowait_boost_pending;
unsigned int iowait_boost;
u64 last_update;
unsigned long bw_dl;
unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
unsigned long saved_idle_calls;
#endif
};
static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
/************************ Governor internals ***********************/
static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
s64 delta_ns;
/*
* Since cpufreq_update_util() is called with rq->lock held for
* the @target_cpu, our per-CPU data is fully serialized.
*
* However, drivers cannot in general deal with cross-CPU
* requests, so while get_next_freq() will work, our
* sugov_update_commit() call may not for the fast switching platforms.
*
* Hence stop here for remote requests if they aren't supported
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
* This is needed on the slow switching platforms too to prevent CPUs
* going offline from leaving stale IRQ work items behind.
*/
if (!cpufreq_this_cpu_can_update(sg_policy->policy))
return false;
if (unlikely(sg_policy->limits_changed)) {
sg_policy->limits_changed = false;
sg_policy->need_freq_update = true;
return true;
}
delta_ns = time - sg_policy->last_freq_update_time;
return delta_ns >= sg_policy->freq_update_delay_ns;
}
static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (sg_policy->need_freq_update)
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
else if (sg_policy->next_freq == next_freq)
return false;
sg_policy->next_freq = next_freq;
sg_policy->last_freq_update_time = time;
return true;
}
static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (sugov_update_next_freq(sg_policy, time, next_freq))
cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
}
static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (!sugov_update_next_freq(sg_policy, time, next_freq))
return;
if (!sg_policy->work_in_progress) {
sg_policy->work_in_progress = true;
irq_work_queue(&sg_policy->irq_work);
}
}
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
* @sg_policy: schedutil policy object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
* If the utilization is frequency-invariant, choose the new frequency to be
* proportional to it, that is
*
* next_freq = C * max_freq * util / max
*
* Otherwise, approximate the would-be frequency-invariant utilization by
* util_raw * (curr_freq / max_freq) which leads to
*
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
*
* The lowest driver-supported frequency which is equal or greater than the raw
* next_freq (as calculated above) is returned, subject to policy min/max and
* cpufreq driver limitations.
*/
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
freq = map_util_freq(util, freq, max);
if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update)
return sg_policy->next_freq;
sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
/*
* This function computes an effective utilization for the given CPU, to be
* used for frequency selection given the linear relation: f = u * f_max.
*
* The scheduler tracks the following metrics:
*
* cpu_util_{cfs,rt,dl,irq}()
* cpu_bw_dl()
*
* Where the cfs,rt and dl util numbers are tracked with the same metric and
* synchronized windows and are thus directly comparable.
*
* The cfs,rt,dl utilization are the running times measured with rq->clock_task
* which excludes things like IRQ and steal-time. These latter are then accrued
* in the irq utilization.
*
* The DL bandwidth number otoh is not a measured metric but a value computed
* based on the task model parameters and gives the minimal utilization
* required to meet deadlines.
*/
unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
unsigned long max, enum schedutil_type type,
struct task_struct *p)
{
unsigned long dl_util, util, irq;
struct rq *rq = cpu_rq(cpu);
if (!uclamp_is_used() &&
type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
return max;
}
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
* because of inaccuracies in how we track these -- see
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
if (unlikely(irq >= max))
return max;
/*
* Because the time spend on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
*
* CFS and RT utilization can be boosted or capped, depending on
* utilization clamp constraints requested by currently RUNNABLE
* tasks.
* When there are no CFS RUNNABLE tasks, clamps are released and
* frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
if (type == FREQUENCY_UTIL)
util = uclamp_rq_util_with(rq, util, p);
dl_util = cpu_util_dl(rq);
/*
* For frequency selection we do not make cpu_util_dl() a permanent part
* of this sum because we want to use cpu_bw_dl() later on, but we need
* to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
* that we select f_max when there is no idle time.
*
* NOTE: numerical errors or stop class might cause us to not quite hit
* saturation when we should -- something for later.
*/
if (util + dl_util >= max)
return max;
/*
* OTOH, for energy computation we need the estimated running time, so
* include util_dl and ignore dl_bw.
*/
if (type == ENERGY_UTIL)
util += dl_util;
/*
* There is still idle time; further improve the number by using the
* irq metric. Because IRQ/steal time is hidden from the task clock we
* need to scale the task numbers:
*
* max - irq
* U' = irq + --------- * U
* max
*/
util = scale_irq_capacity(util, irq, max);
util += irq;
/*
* Bandwidth required by DEADLINE must always be granted while, for
* FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
* to gracefully reduce the frequency when no tasks show up for longer
* periods of time.
*
* Ideally we would like to set bw_dl as min/guaranteed freq and util +
* bw_dl as requested freq. However, cpufreq is not yet ready for such
* an interface. So, we only do the latter for now.
*/
if (type == FREQUENCY_UTIL)
util += cpu_bw_dl(rq);
return min(max, util);
}
static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
unsigned long util = cpu_util_cfs(rq);
unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->max = max;
sg_cpu->bw_dl = cpu_bw_dl(rq);
return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
}
/**
* sugov_iowait_reset() - Reset the IO boost status of a CPU.
* @sg_cpu: the sugov data for the CPU to boost
* @time: the update time from the caller
* @set_iowait_boost: true if an IO boost has been requested
*
* The IO wait boost of a task is disabled after a tick since the last update
* of a CPU. If a new IO wait boost is requested after more then a tick, then
* we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
* efficiency by ignoring sporadic wakeups from IO.
*/
static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
bool set_iowait_boost)
{
s64 delta_ns = time - sg_cpu->last_update;
/* Reset boost only if a tick has elapsed since last request */
if (delta_ns <= TICK_NSEC)
return false;
sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
return true;
}
/**
* sugov_iowait_boost() - Updates the IO boost status of a CPU.
* @sg_cpu: the sugov data for the CPU to boost
* @time: the update time from the caller
* @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
*
* Each time a task wakes up after an IO operation, the CPU utilization can be
* boosted to a certain utilization which doubles at each "frequent and
* successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
* of the maximum OPP.
*
* To keep doubling, an IO boost has to be requested at least once per tick,
* otherwise we restart from the utilization of the minimum OPP.
*/
static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
unsigned int flags)
{
bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;
/* Reset boost if the CPU appears to have been idle enough */
if (sg_cpu->iowait_boost &&
sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
return;
/* Boost only tasks waking up after IO */
if (!set_iowait_boost)
return;
/* Ensure boost doubles only one time at each request */
if (sg_cpu->iowait_boost_pending)
return;
sg_cpu->iowait_boost_pending = true;
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
sg_cpu->iowait_boost =
min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
return;
}
/* First wakeup after IO: start with minimum boost */
sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}
/**
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
* @util: the utilization to (eventually) boost
* @max: the maximum value the utilization can be boosted to
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
* The IO boost value is increased each time a task wakes up from IO, in
* sugov_iowait_apply(), and it's instead decreased by this function,
* each time an increase has not been requested (!iowait_boost_pending).
*
* A CPU which also appears to have been idle for at least one tick has also
* its IO boost utilization reset.
*
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
unsigned long util, unsigned long max)
{
unsigned long boost;
/* No boost currently required */
if (!sg_cpu->iowait_boost)
return util;
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
return util;
if (!sg_cpu->iowait_boost_pending) {
/*
* No boost pending; reduce the boost value.
*/
sg_cpu->iowait_boost >>= 1;
if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
sg_cpu->iowait_boost = 0;
return util;
}
}
sg_cpu->iowait_boost_pending = false;
/*
* @util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
return max(boost, util);
}
#ifdef CONFIG_NO_HZ_COMMON
static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
{
unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
bool ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
#else
static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
/*
* Make sugov_should_update_freq() ignore the rate limit when DL
* has increased the utilization.
*/
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
{
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
sg_policy->limits_changed = true;
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long util, max;
unsigned int next_f;
unsigned int cached_freq = sg_policy->cached_raw_freq;
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (!sugov_should_update_freq(sg_policy, time))
return;
util = sugov_get_util(sg_cpu);
max = sg_cpu->max;
util = sugov_iowait_apply(sg_cpu, time, util, max);
next_f = get_next_freq(sg_policy, util, max);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
sg_policy->cached_raw_freq = cached_freq;
}
/*
* This code runs under rq->lock for the target CPU, so it won't run
* concurrently on two different CPUs for the same target and it is not
* necessary to acquire the lock in the fast switch case.
*/
if (sg_policy->policy->fast_switch_enabled) {
sugov_fast_switch(sg_policy, time, next_f);
} else {
raw_spin_lock(&sg_policy->update_lock);
sugov_deferred_update(sg_policy, time, next_f);
raw_spin_unlock(&sg_policy->update_lock);
}
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned long util = 0, max = 1;
unsigned int j;
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
unsigned long j_util, j_max;
j_util = sugov_get_util(j_sg_cpu);
j_max = j_sg_cpu->max;
j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max);
if (j_util * max > j_max * util) {
util = j_util;
max = j_max;
}
}
return get_next_freq(sg_policy, util, max);
}
static void
sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int next_f;
raw_spin_lock(&sg_policy->update_lock);
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
ignore_dl_rate_limit(sg_cpu, sg_policy);
if (sugov_should_update_freq(sg_policy, time)) {
next_f = sugov_next_freq_shared(sg_cpu, time);
if (sg_policy->policy->fast_switch_enabled)
sugov_fast_switch(sg_policy, time, next_f);
else
sugov_deferred_update(sg_policy, time, next_f);
}
raw_spin_unlock(&sg_policy->update_lock);
}
static void sugov_work(struct kthread_work *work)
{
struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
unsigned int freq;
unsigned long flags;
/*
* Hold sg_policy->update_lock shortly to handle the case where:
* incase sg_policy->next_freq is read here, and then updated by
* sugov_deferred_update() just before work_in_progress is set to false
* here, we may miss queueing the new update.
*
* Note: If a work was queued after the update_lock is released,
* sugov_work() will just be called again by kthread_work code; and the
* request will be proceed before the sugov thread sleeps.
*/
raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
freq = sg_policy->next_freq;
sg_policy->work_in_progress = false;
raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);
mutex_lock(&sg_policy->work_lock);
__cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&sg_policy->work_lock);
}
static void sugov_irq_work(struct irq_work *irq_work)
{
struct sugov_policy *sg_policy;
sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
kthread_queue_work(&sg_policy->worker, &sg_policy->work);
}
/************************** sysfs interface ************************/
static struct sugov_tunables *global_tunables;
static DEFINE_MUTEX(global_tunables_lock);
static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
{
return container_of(attr_set, struct sugov_tunables, attr_set);
}
static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
return sprintf(buf, "%u\n", tunables->rate_limit_us);
}
static ssize_t
rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
{
struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
struct sugov_policy *sg_policy;
unsigned int rate_limit_us;
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
tunables->rate_limit_us = rate_limit_us;
list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
return count;
}
static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
static struct attribute *sugov_attrs[] = {
&rate_limit_us.attr,
NULL
};
ATTRIBUTE_GROUPS(sugov);
static struct kobj_type sugov_tunables_ktype = {
.default_groups = sugov_groups,
.sysfs_ops = &governor_sysfs_ops,
};
/********************** cpufreq governor interface *********************/
struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
if (!sg_policy)
return NULL;
sg_policy->policy = policy;
raw_spin_lock_init(&sg_policy->update_lock);
return sg_policy;
}
static void sugov_policy_free(struct sugov_policy *sg_policy)
{
kfree(sg_policy);
}
static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
struct task_struct *thread;
struct sched_attr attr = {
.size = sizeof(struct sched_attr),
.sched_policy = SCHED_DEADLINE,
.sched_flags = SCHED_FLAG_SUGOV,
.sched_nice = 0,
.sched_priority = 0,
/*
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
.sched_runtime = 1000000,
.sched_deadline = 10000000,
.sched_period = 10000000,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
/* kthread only required for slow path */
if (policy->fast_switch_enabled)
return 0;
kthread_init_work(&sg_policy->work, sugov_work);
kthread_init_worker(&sg_policy->worker);
thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
"sugov:%d",
cpumask_first(policy->related_cpus));
if (IS_ERR(thread)) {
pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
return PTR_ERR(thread);
}
ret = sched_setattr_nocheck(thread, &attr);
if (ret) {
kthread_stop(thread);
pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
return ret;
}
sg_policy->thread = thread;
kthread_bind_mask(thread, policy->related_cpus);
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
wake_up_process(thread);
return 0;
}
static void sugov_kthread_stop(struct sugov_policy *sg_policy)
{
/* kthread only required for slow path */
if (sg_policy->policy->fast_switch_enabled)
return;
kthread_flush_worker(&sg_policy->worker);
kthread_stop(sg_policy->thread);
mutex_destroy(&sg_policy->work_lock);
}
static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
{
struct sugov_tunables *tunables;
tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
if (tunables) {
gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
if (!have_governor_per_policy())
global_tunables = tunables;
}
return tunables;
}
static void sugov_tunables_free(struct sugov_tunables *tunables)
{
if (!have_governor_per_policy())
global_tunables = NULL;
kfree(tunables);
}
static int sugov_init(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy;
struct sugov_tunables *tunables;
int ret = 0;
/* State should be equivalent to EXIT */
if (policy->governor_data)
return -EBUSY;
cpufreq_enable_fast_switch(policy);
sg_policy = sugov_policy_alloc(policy);
if (!sg_policy) {
ret = -ENOMEM;
goto disable_fast_switch;
}
ret = sugov_kthread_create(sg_policy);
if (ret)
goto free_sg_policy;
mutex_lock(&global_tunables_lock);
if (global_tunables) {
if (WARN_ON(have_governor_per_policy())) {
ret = -EINVAL;
goto stop_kthread;
}
policy->governor_data = sg_policy;
sg_policy->tunables = global_tunables;
gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
goto out;
}
tunables = sugov_tunables_alloc(sg_policy);
if (!tunables) {
ret = -ENOMEM;
goto stop_kthread;
}
tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
policy->governor_data = sg_policy;
sg_policy->tunables = tunables;
ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
get_governor_parent_kobj(policy), "%s",
schedutil_gov.name);
if (ret)
goto fail;
out:
mutex_unlock(&global_tunables_lock);
return 0;
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
sugov_tunables_free(tunables);
stop_kthread:
sugov_kthread_stop(sg_policy);
mutex_unlock(&global_tunables_lock);
free_sg_policy:
sugov_policy_free(sg_policy);
disable_fast_switch:
cpufreq_disable_fast_switch(policy);
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
static void sugov_exit(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
struct sugov_tunables *tunables = sg_policy->tunables;
unsigned int count;
mutex_lock(&global_tunables_lock);
count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count)
sugov_tunables_free(tunables);
mutex_unlock(&global_tunables_lock);
sugov_kthread_stop(sg_policy);
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
}
static int sugov_start(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
sg_policy->last_freq_update_time = 0;
sg_policy->next_freq = 0;
sg_policy->work_in_progress = false;
sg_policy->limits_changed = false;
sg_policy->cached_raw_freq = 0;
sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
}
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
sugov_update_single);
}
return 0;
}
static void sugov_stop(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
for_each_cpu(cpu, policy->cpus)
cpufreq_remove_update_util_hook(cpu);
synchronize_rcu();
if (!policy->fast_switch_enabled) {
irq_work_sync(&sg_policy->irq_work);
kthread_cancel_work_sync(&sg_policy->work);
}
}
static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
cpufreq_policy_apply_limits(policy);
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->limits_changed = true;
}
struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
.flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
.stop = sugov_stop,
.limits = sugov_limits,
};
#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
struct cpufreq_governor *cpufreq_default_governor(void)
{
return &schedutil_gov;
}
#endif
cpufreq_governor_init(schedutil_gov);
#ifdef CONFIG_ENERGY_MODEL
static void rebuild_sd_workfn(struct work_struct *work)
{
rebuild_sched_domains_energy();
}
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
/*
* EAS shouldn't be attempted without sugov, so rebuild the sched_domains
* on governor changes to make sure the scheduler knows about it.
*/
void sched_cpufreq_governor_change(struct cpufreq_policy *policy,
struct cpufreq_governor *old_gov)
{
if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) {
/*
* When called from the cpufreq_register_driver() path, the
* cpu_hotplug_lock is already held, so use a work item to
* avoid nested locking in rebuild_sched_domains().
*/
schedule_work(&rebuild_sd_work);
}
}
#endif