Merge branch 'pm-x86'

* pm-x86:
  x86: tsc: Rework time_cpufreq_notifier()
  admin-guide: pm: intel_epb: Add SPDX license tag and copyright notice
  PM / arch: x86: MSR_IA32_ENERGY_PERF_BIAS sysfs interface
  PM / arch: x86: Rework the MSR_IA32_ENERGY_PERF_BIAS handling
This commit is contained in:
Rafael J. Wysocki 2019-05-06 10:54:07 +02:00
commit 4566e2dd4a
10 changed files with 292 additions and 68 deletions

View file

@ -518,3 +518,21 @@ Description: Control Symmetric Multi Threading (SMT)
If control status is "forceoff" or "notsupported" writes
are rejected.
What: /sys/devices/system/cpu/cpu#/power/energy_perf_bias
Date: March 2019
Contact: linux-pm@vger.kernel.org
Description: Intel Energy and Performance Bias Hint (EPB)
EPB for the given CPU in a sliding scale 0 - 15, where a value
of 0 corresponds to a hint preference for highest performance
and a value of 15 corresponds to the maximum energy savings.
In order to change the EPB value for the CPU, write either
a number in the 0 - 15 sliding scale above, or one of the
strings: "performance", "balance-performance", "normal",
"balance-power", "power" (that represent values reflected by
their meaning), to this attribute.
This attribute is present for all online CPUs supporting the
Intel EPB feature.

View file

@ -0,0 +1,41 @@
.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>
======================================
Intel Performance and Energy Bias Hint
======================================
:Copyright: |copy| 2019 Intel Corporation
:Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
.. kernel-doc:: arch/x86/kernel/cpu/intel_epb.c
   :doc: overview
Intel Performance and Energy Bias Attribute in ``sysfs``
========================================================
The Intel Performance and Energy Bias Hint (EPB) value for a given (logical) CPU
can be checked or updated through a ``sysfs`` attribute (file) under
:file:`/sys/devices/system/cpu/cpu<N>/power/`, where the CPU number ``<N>``
is allocated at the system initialization time:
``energy_perf_bias``
	Shows the current EPB value for the CPU in a sliding scale 0 - 15, where
	a value of 0 corresponds to a hint preference for highest performance
	and a value of 15 corresponds to the maximum energy savings.

	In order to update the EPB value for the CPU, this attribute can be
	written to, either with a number in the 0 - 15 sliding scale above, or
	with one of the strings: "performance", "balance-performance", "normal",
	"balance-power", "power" that represent values reflected by their
	meaning.

	This attribute is present for all online CPUs supporting the EPB
	feature.
Note that while the EPB interface to the processor is defined at the logical CPU
level, the physical register backing it may be shared by multiple CPUs (for
example, SMT siblings or cores in one package). For this reason, updating the
EPB value for one CPU may cause the EPB values for other CPUs to change.

View file

@ -8,3 +8,4 @@ Working-State Power Management
cpuidle
cpufreq
intel_pstate
intel_epb

View file

@ -28,7 +28,7 @@ obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o

View file

@ -1864,23 +1864,6 @@ void cpu_init(void)
}
#endif
/*
 * Resume helper: if this_cpu provides a c_bsp_resume hook, call it with
 * boot_cpu_data; otherwise do nothing.
 */
static void bsp_resume(void)
{
if (this_cpu->c_bsp_resume)
this_cpu->c_bsp_resume(&boot_cpu_data);
}
/* Syscore operations for the CPU code: only a resume callback is provided. */
static struct syscore_ops cpu_syscore_ops = {
.resume = bsp_resume,
};
/*
 * Register cpu_syscore_ops with the syscore framework.
 *
 * Runs as a core_initcall and always returns 0.
 */
static int __init init_cpu_syscore(void)
{
register_syscore_ops(&cpu_syscore_ops);
return 0;
}
core_initcall(init_cpu_syscore);
/*
* The microcode loader calls this upon late microcode load to recheck features,
* only when microcode has been updated. Caller holds microcode_mutex and CPU

View file

@ -14,7 +14,6 @@ struct cpu_dev {
void (*c_init)(struct cpuinfo_x86 *);
void (*c_identify)(struct cpuinfo_x86 *);
void (*c_detect_tlb)(struct cpuinfo_x86 *);
void (*c_bsp_resume)(struct cpuinfo_x86 *);
int c_x86_vendor;
#ifdef CONFIG_X86_32
/* Optional vendor specific routine to obtain the cache size. */

View file

@ -596,36 +596,6 @@ static void detect_tme(struct cpuinfo_x86 *c)
c->x86_phys_bits -= keyid_bits;
}
/*
 * Switch the EPB field of MSR_IA32_ENERGY_PERF_BIAS from 'performance' to
 * 'normal' on CPUs that advertise X86_FEATURE_EPB.
 *
 * @c: CPU whose feature flags are checked.
 *
 * Does nothing when the feature is absent or when the low four bits of the
 * MSR hold anything other than ENERGY_PERF_BIAS_PERFORMANCE.  The bits above
 * the low nibble are written back unchanged.
 */
static void init_intel_energy_perf(struct cpuinfo_x86 *c)
{
u64 epb;
/*
* Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
* (x86_energy_perf_policy(8) is available to change it at run-time.)
*/
if (!cpu_has(c, X86_FEATURE_EPB))
return;
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
/* Leave any value other than 'performance' alone. */
if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
return;
pr_info_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
pr_info_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
/* Replace only the low nibble; keep the remaining MSR bits as read. */
epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
/*
 * Intel c_bsp_resume hook: re-run init_intel_energy_perf() for @c after
 * resume.
 */
static void intel_bsp_resume(struct cpuinfo_x86 *c)
{
/*
* MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
* so reinitialize it properly like during bootup:
*/
init_intel_energy_perf(c);
}
static void init_cpuid_fault(struct cpuinfo_x86 *c)
{
u64 msr;
@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
init_intel_energy_perf(c);
init_intel_misc_features(c);
}
@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = {
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_init = init_intel,
.c_bsp_resume = intel_bsp_resume,
.c_x86_vendor = X86_VENDOR_INTEL,
};
cpu_dev_register(intel_cpu_dev);

View file

@ -0,0 +1,216 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Intel Performance and Energy Bias Hint support.
*
* Copyright (C) 2019 Intel Corporation
*
* Author:
* Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
#include <linux/cpuhotplug.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/syscore_ops.h>
#include <linux/pm.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
/**
* DOC: overview
*
* The Performance and Energy Bias Hint (EPB) allows software to specify its
* preference with respect to the power-performance tradeoffs present in the
* processor. Generally, the EPB is expected to be set by user space (directly
* via sysfs or with the help of the x86_energy_perf_policy tool), but there are
* two reasons for the kernel to update it.
*
* First, there are systems where the platform firmware resets the EPB during
* system-wide transitions from sleep states back into the working state
* effectively causing the previous EPB updates by user space to be lost.
* Thus the kernel needs to save the current EPB values for all CPUs during
* system-wide transitions to sleep states and restore them on the way back to
* the working state. That can be achieved by saving EPB for secondary CPUs
* when they are taken offline during transitions into system sleep states and
* for the boot CPU in a syscore suspend operation, so that it can be restored
* for the boot CPU in a syscore resume operation and for the other CPUs when
* they are brought back online. However, CPUs that are already offline when
* a system-wide PM transition is started are not taken offline again, but their
* EPB values may still be reset by the platform firmware during the transition,
* so in fact it is necessary to save the EPB of any CPU taken offline and to
* restore it when the given CPU goes back online at all times.
*
* Second, on many systems the initial EPB value coming from the platform
* firmware is 0 ('performance') and at least on some of them that is because
* the platform firmware does not initialize EPB at all with the assumption that
* the OS will do that anyway. That sometimes is problematic, as it may cause
* the system battery to drain too fast, for example, so it is better to adjust
* it on CPU bring-up and if the initial EPB value for a given CPU is 0, the
* kernel changes it to 6 ('normal').
*/
/*
 * Per-CPU copy of the EPB value captured by intel_epb_save().  It stays 0
 * until the first save for that CPU, which intel_epb_restore() relies on to
 * tell "never saved" apart from "saved".
 */
static DEFINE_PER_CPU(u8, saved_epb);
/* Bits of MSR_IA32_ENERGY_PERF_BIAS treated as the EPB value. */
#define EPB_MASK 0x0fULL
/* Marker OR-ed into saved_epb so that a saved EPB of 0 is still nonzero. */
#define EPB_SAVED 0x10ULL
/* Largest numeric EPB value accepted by energy_perf_bias_store(). */
#define MAX_EPB EPB_MASK
static int intel_epb_save(void)
{
u64 epb;
rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
/*
* Ensure that saved_epb will always be nonzero after this write even if
* the EPB value read from the MSR is 0.
*/
this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED);
return 0;
}
/*
 * Program the current CPU's EPB: use the value recorded by intel_epb_save()
 * when there is one, otherwise keep what the MSR holds, except that an
 * initial 0 ('performance') is replaced with 6 ('normal').
 *
 * MSR bits outside EPB_MASK are written back as they were read.
 */
static void intel_epb_restore(void)
{
	u64 saved = this_cpu_read(saved_epb);
	u64 msr_val, new_epb;

	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, msr_val);

	if (!saved) {
		/*
		 * Nothing has been saved for this CPU, so it is coming online
		 * for the first time.  An EPB of 0 ('performance') here is
		 * taken to mean the platform firmware never initialized it.
		 */
		new_epb = msr_val & EPB_MASK;
		if (new_epb == ENERGY_PERF_BIAS_PERFORMANCE) {
			new_epb = ENERGY_PERF_BIAS_NORMAL;
			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
		}
	} else {
		/* Drop the EPB_SAVED marker, leaving only the EPB bits. */
		new_epb = saved & EPB_MASK;
	}

	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (msr_val & ~EPB_MASK) | new_epb);
}
/*
 * Syscore hooks: save the EPB on suspend and restore it on resume.  Per the
 * overview above, these cover the boot CPU; other CPUs go through the
 * hotplug online/offline callbacks.
 */
static struct syscore_ops intel_epb_syscore_ops = {
.suspend = intel_epb_save,
.resume = intel_epb_restore,
};
/*
 * Symbolic names accepted by energy_perf_bias_store().  The entry at index i
 * maps to energ_perf_values[i], so the two tables must stay the same length
 * and in the same order.
 */
static const char * const energy_perf_strings[] = {
"performance",
"balance-performance",
"normal",
"balance-power",
"power"
};
/* EPB values for the names in energy_perf_strings[], index for index. */
static const u8 energ_perf_values[] = {
ENERGY_PERF_BIAS_PERFORMANCE,
ENERGY_PERF_BIAS_BALANCE_PERFORMANCE,
ENERGY_PERF_BIAS_NORMAL,
ENERGY_PERF_BIAS_BALANCE_POWERSAVE,
ENERGY_PERF_BIAS_POWERSAVE
};
static ssize_t energy_perf_bias_show(struct device *dev,
struct device_attribute *attr,
char *buf)
{
unsigned int cpu = dev->id;
u64 epb;
int ret;
ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
if (ret < 0)
return ret;
return sprintf(buf, "%llu\n", epb);
}
/*
 * sysfs write handler for the energy_perf_bias attribute.
 *
 * @dev:   CPU device; dev->id is used as the CPU number to update.
 * @attr:  attribute being written (unused).
 * @buf:   user input: one of energy_perf_strings[] or a number up to MAX_EPB.
 * @count: length of the input.
 *
 * Returns @count on success, -EINVAL for unparsable or out-of-range input,
 * or the negative error from the MSR read/write helpers.
 */
static ssize_t energy_perf_bias_store(struct device *dev,
				      struct device_attribute *attr,
				      const char *buf, size_t count)
{
	unsigned int target_cpu = dev->id;
	u64 msr_val, new_epb;
	int rc;

	/* Try the symbolic names first; fall back to a plain number. */
	rc = __sysfs_match_string(energy_perf_strings,
				  ARRAY_SIZE(energy_perf_strings), buf);
	if (rc < 0) {
		if (kstrtou64(buf, 0, &new_epb))
			return -EINVAL;
		if (new_epb > MAX_EPB)
			return -EINVAL;
	} else {
		new_epb = energ_perf_values[rc];
	}

	rc = rdmsrl_on_cpu(target_cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr_val);
	if (rc < 0)
		return rc;

	/* Replace only the EPB bits; keep the rest of the MSR as read. */
	msr_val = (msr_val & ~EPB_MASK) | new_epb;
	rc = wrmsrl_on_cpu(target_cpu, MSR_IA32_ENERGY_PERF_BIAS, msr_val);
	if (rc < 0)
		return rc;

	return count;
}
/*
 * Read/write device attribute backed by energy_perf_bias_show() and
 * energy_perf_bias_store(); referenced below as dev_attr_energy_perf_bias.
 */
static DEVICE_ATTR_RW(energy_perf_bias);
/* NULL-terminated list of attributes exposed by this file. */
static struct attribute *intel_epb_attrs[] = {
&dev_attr_energy_perf_bias.attr,
NULL
};
/*
 * Attribute group named by power_group_name.  It is merged into and unmerged
 * from each CPU device's kobject by the hotplug callbacks below.
 */
static const struct attribute_group intel_epb_attr_group = {
.name = power_group_name,
.attrs = intel_epb_attrs
};
/*
 * CPU hotplug "online" callback: restore (or initialize) the EPB of the CPU
 * and expose the energy_perf_bias attribute for it.
 *
 * @cpu: number of the CPU coming online.
 *
 * Always returns 0.
 */
static int intel_epb_online(unsigned int cpu)
{
	struct device *dev = get_cpu_device(cpu);

	intel_epb_restore();

	/*
	 * Leave sysfs alone while cpuhp_tasks_frozen is set (presumably a
	 * system-wide PM transition -- confirm against the hotplug core).
	 */
	if (cpuhp_tasks_frozen)
		return 0;

	sysfs_merge_group(&dev->kobj, &intel_epb_attr_group);
	return 0;
}
static int intel_epb_offline(unsigned int cpu)
{
struct device *cpu_dev = get_cpu_device(cpu);
if (!cpuhp_tasks_frozen)
sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group);
intel_epb_save();
return 0;
}
/*
 * Set up EPB handling: install the CPU hotplug callbacks and register the
 * syscore suspend/resume operations.
 *
 * Returns 0 on success, -ENODEV when the boot CPU lacks X86_FEATURE_EPB, or
 * the negative error reported by cpuhp_setup_state().
 */
static __init int intel_epb_init(void)
{
	int ret;

	if (!boot_cpu_has(X86_FEATURE_EPB))
		return -ENODEV;

	ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE,
				"x86/intel/epb:online", intel_epb_online,
				intel_epb_offline);
	/*
	 * A failed cpuhp_setup_state() leaves no callbacks installed for the
	 * state, so there is nothing to undo.  Calling cpuhp_remove_state()
	 * here would at best be redundant and at worst remove callbacks that
	 * this code never installed.
	 */
	if (ret < 0)
		return ret;

	register_syscore_ops(&intel_epb_syscore_ops);
	return 0;
}
subsys_initcall(intel_epb_init);

View file

@ -185,8 +185,7 @@ static void __init cyc2ns_init_boot_cpu(void)
/*
* Secondary CPUs do not run through tsc_init(), so set up
* all the scale factors for all CPUs, assuming the same
* speed as the bootup CPU. (cpufreq notifiers will fix this
* up if their speed diverges)
* speed as the bootup CPU.
*/
static void __init cyc2ns_init_secondary_cpus(void)
{
@ -937,12 +936,12 @@ void tsc_restore_sched_clock_state(void)
}
#ifdef CONFIG_CPU_FREQ
/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
/*
* Frequency scaling support. Adjust the TSC based timer when the CPU frequency
* changes.
*
* RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
* not that important because current Opteron setups do not support
* scaling on SMP anyroads.
* NOTE: On SMP the situation is not fixable in general, so simply mark the TSC
* as unstable and give up in those cases.
*
* Should fix up last_tsc too. Currently gettimeofday in the
* first tick after the change will be slightly wrong.
@ -956,22 +955,22 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct cpufreq_freqs *freq = data;
unsigned long *lpj;
lpj = &boot_cpu_data.loops_per_jiffy;
#ifdef CONFIG_SMP
if (!(freq->flags & CPUFREQ_CONST_LOOPS))
lpj = &cpu_data(freq->cpu).loops_per_jiffy;
#endif
if (num_online_cpus() > 1) {
mark_tsc_unstable("cpufreq changes on SMP");
return 0;
}
if (!ref_freq) {
ref_freq = freq->old;
loops_per_jiffy_ref = *lpj;
loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy;
tsc_khz_ref = tsc_khz;
}
if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
*lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
(val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) {
boot_cpu_data.loops_per_jiffy =
cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
if (!(freq->flags & CPUFREQ_CONST_LOOPS))

View file

@ -147,6 +147,7 @@ enum cpuhp_state {
CPUHP_AP_X86_VDSO_VMA_ONLINE,
CPUHP_AP_IRQ_AFFINITY_ONLINE,
CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
CPUHP_AP_X86_INTEL_EPB_ONLINE,
CPUHP_AP_PERF_ONLINE,
CPUHP_AP_PERF_X86_ONLINE,
CPUHP_AP_PERF_X86_UNCORE_ONLINE,