Merge tag 'x86-timers-2020-06-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 timer updates from Thomas Gleixner:
 "X86 timer specific updates:

   - Add TPAUSE based delay which allows the CPU to enter an optimized
     power state while waiting for the delay to pass. The delay is based
     on TSC cycles.

   - Add tsc_early_khz command line parameter to work around the problem
     that overclocked CPUs can report the wrong frequency via CPUID.16h,
     which causes the refined calibration to fail because the delta to
     the initial frequency value is too big. With the parameter, users
     can provide a halfway accurate initial value"

* tag 'x86-timers-2020-06-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tsc: Add tsc_early_khz command line parameter
  x86/delay: Introduce TPAUSE delay
  x86/delay: Refactor delay_mwaitx() for TPAUSE support
  x86/delay: Preparatory code cleanup
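
As a usage illustration (the frequency below is made up, not taken from the
series): an overclocked part that actually runs at 4.2 GHz but still reports
its stock frequency via CPUID.16h could be booted with a roughly correct
initial value, given in kHz:

    tsc_early_khz=4200000

With that starting point, the delta seen by the refined calibration stays
small and calibration can complete.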
commit f6aee505c7
Author: Linus Torvalds
Date:   2020-06-03 10:18:09 -07:00

 7 files changed, 129 insertions(+), 40 deletions(-)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5093,6 +5093,12 @@
 			interruptions from clocksource watchdog are not
 			acceptable).
 
+	tsc_early_khz=	[X86] Skip early TSC calibration and use the given
+			value instead. Useful when the early TSC frequency discovery
+			procedure is not reliable, such as on overclocked systems
+			with CPUID.16h support and partial CPUID.15h support.
+			Format: <unsigned int>
+
 	tsx=		[X86] Control Transactional Synchronization
 			Extensions (TSX) feature in Intel processors that
 			support TSX control.

--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -15,3 +15,7 @@ config AS_SHA256_NI
 	def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1)
 	help
 	  Supported by binutils >= 2.24 and LLVM integrated assembler
+
+config AS_TPAUSE
+	def_bool $(as-instr,tpause %ecx)
+	help
+	  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7

--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -3,8 +3,10 @@
 #define _ASM_X86_DELAY_H
 
 #include <asm-generic/delay.h>
+#include <linux/init.h>
 
-void use_tsc_delay(void);
+void __init use_tsc_delay(void);
+void __init use_tpause_delay(void);
 void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */

--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -20,8 +20,10 @@
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
 
 #define MWAITX_ECX_TIMER_ENABLE		BIT(1)
-#define MWAITX_MAX_LOOPS		((u32)-1)
+#define MWAITX_MAX_WAIT_CYCLES		UINT_MAX
 #define MWAITX_DISABLE_CSTATES		0xf0
+#define TPAUSE_C01_STATE		1
+#define TPAUSE_C02_STATE		0
 
 u32 get_umwait_control_msr(void);
 
@@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 	current_clr_polling();
 }
 
+/*
+ * Caller can specify whether to enter C0.1 (low latency, less
+ * power saving) or C0.2 state (saves more power, but longer wakeup
+ * latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR
+ * which can force requests for C0.2 to be downgraded to C0.1.
+ */
+static inline void __tpause(u32 ecx, u32 edx, u32 eax)
+{
+	/* "tpause %ecx, %edx, %eax;" */
+	#ifdef CONFIG_AS_TPAUSE
+	asm volatile("tpause %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#else
+	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+	#endif
+}
+
 #endif /* _ASM_X86_MWAIT_H */
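
The same instruction is also usable from user space on CPUs with WAITPKG.
Below is a minimal, hypothetical user-space sketch (not part of this series)
using the compiler's _tpause() intrinsic; it assumes a toolchain with WAITPKG
intrinsics and is built with gcc -mwaitpkg:

    /* tpause_demo.c - hedged illustration, not kernel code */
    #include <stdio.h>
    #include <x86intrin.h>          /* __rdtsc(), _tpause() */

    int main(void)
    {
            unsigned long long start = __rdtsc();
            /* TPAUSE takes an absolute TSC deadline in EDX:EAX */
            unsigned long long deadline = start + 100000000ULL;

            /* ctrl=0 requests the deeper C0.2 state; 1 would be C0.1 */
            _tpause(0, deadline);

            printf("waited ~%llu TSC cycles\n", __rdtsc() - start);
            return 0;
    }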

--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -103,6 +103,9 @@ static __init void x86_late_time_init(void)
 	 */
 	x86_init.irqs.intr_mode_init();
 	tsc_init();
+
+	if (static_cpu_has(X86_FEATURE_WAITPKG))
+		use_tpause_delay();
 }
 
 /*
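
The selection above keys off X86_FEATURE_WAITPKG. For reference, the
underlying CPUID bit is CPUID.(EAX=7,ECX=0):ECX[5]; a small hypothetical
user-space check (not part of this series) could read it like this:

    #include <stdio.h>
    #include <cpuid.h>              /* __get_cpuid_count(), GCC/clang */

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* Leaf 7, subleaf 0: ECX bit 5 advertises WAITPKG */
            if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("WAITPKG: %s\n", (ecx & (1u << 5)) ? "yes" : "no");
            return 0;
    }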

--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz);
  * TSC can be unstable due to cpufreq or due to unsynced TSCs
  */
 static int __read_mostly tsc_unstable;
+static unsigned int __initdata tsc_early_khz;
 
 static DEFINE_STATIC_KEY_FALSE(__use_tsc);
 
@@ -59,6 +60,12 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
+static int __init tsc_early_khz_setup(char *buf)
+{
+	return kstrtouint(buf, 0, &tsc_early_khz);
+}
+early_param("tsc_early_khz", tsc_early_khz_setup);
+
 __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
 {
 	int seq, idx;
 
@@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
 	if (early) {
 		cpu_khz = x86_platform.calibrate_cpu();
-		tsc_khz = x86_platform.calibrate_tsc();
+		if (tsc_early_khz)
+			tsc_khz = tsc_early_khz;
+		else
+			tsc_khz = x86_platform.calibrate_tsc();
 	} else {
 		/* We should not be here with non-native cpu calibration */
 		WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);

--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -27,9 +27,20 @@
 # include <asm/smp.h>
 #endif
 
+static void delay_loop(u64 __loops);
+
+/*
+ * Calibration and selection of the delay mechanism happens only once
+ * during boot.
+ */
+static void (*delay_fn)(u64) __ro_after_init = delay_loop;
+static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;
+
 /* simple loop based delay: */
-static void delay_loop(unsigned long loops)
+static void delay_loop(u64 __loops)
 {
+	unsigned long loops = (unsigned long)__loops;
+
 	asm volatile(
 		"	test %0,%0	\n"
 		"	jz 3f		\n"
@@ -49,9 +60,9 @@ static void delay_loop(unsigned long loops)
 }
 
 /* TSC based delay: */
-static void delay_tsc(unsigned long __loops)
+static void delay_tsc(u64 cycles)
 {
-	u64 bclock, now, loops = __loops;
+	u64 bclock, now;
 	int cpu;
 
 	preempt_disable();
@@ -59,7 +70,7 @@ static void delay_tsc(unsigned long __loops)
 	bclock = rdtsc_ordered();
 	for (;;) {
 		now = rdtsc_ordered();
-		if ((now - bclock) >= loops)
+		if ((now - bclock) >= cycles)
 			break;
 
 		/* Allow RT tasks to run */
@@ -77,7 +88,7 @@ static void delay_tsc(unsigned long __loops)
 	 * counter for this CPU.
 	 */
 	if (unlikely(cpu != smp_processor_id())) {
-		loops -= (now - bclock);
+		cycles -= (now - bclock);
 		cpu = smp_processor_id();
 		bclock = rdtsc_ordered();
 	}
@@ -86,65 +97,96 @@ static void delay_tsc(unsigned long __loops)
 }
 
 /*
- * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
- * counts with TSC frequency. The input value is the loop of the
- * counter, it will exit when the timer expires.
+ * On Intel the TPAUSE instruction waits until any of:
+ * 1) the TSC counter exceeds the value provided in EDX:EAX
+ * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
+ * 3) an external interrupt occurs
  */
-static void delay_mwaitx(unsigned long __loops)
+static void delay_halt_tpause(u64 start, u64 cycles)
 {
-	u64 start, end, delay, loops = __loops;
+	u64 until = start + cycles;
+	u32 eax, edx;
+
+	eax = lower_32_bits(until);
+	edx = upper_32_bits(until);
+
+	/*
+	 * Hard code the deeper (C0.2) sleep state because exit latency is
+	 * small compared to the "microseconds" that usleep() will delay.
+	 */
+	__tpause(TPAUSE_C02_STATE, edx, eax);
+}
+
+/*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the number of TSC cycles
+ * to wait. MWAITX will also exit when the timer expires.
+ */
+static void delay_halt_mwaitx(u64 unused, u64 cycles)
+{
+	u64 delay;
+
+	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
+	/*
+	 * Use cpu_tss_rw as a cacheline-aligned, seldomly accessed per-cpu
+	 * variable as the monitor target.
+	 */
+	__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
+
+	/*
+	 * AMD, like Intel, supports the EAX hint and EAX=0xf means, do not
+	 * enter any deep C-state and we use it here in delay() to minimize
+	 * wakeup latency.
+	 */
+	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+}
+
+/*
+ * Call a vendor specific function to delay for a given amount of time. Because
+ * these functions may return earlier than requested, check for actual elapsed
+ * time and call again until done.
+ */
+static void delay_halt(u64 __cycles)
+{
+	u64 start, end, cycles = __cycles;
 
 	/*
 	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
 	 * is a store on the memory monitored by MONITORX.
 	 */
-	if (loops == 0)
+	if (!cycles)
 		return;
 
 	start = rdtsc_ordered();
 
 	for (;;) {
-		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
-
-		/*
-		 * Use cpu_tss_rw as a cacheline-aligned, seldomly
-		 * accessed per-cpu variable as the monitor target.
-		 */
-		__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
-
-		/*
-		 * AMD, like Intel's MWAIT version, supports the EAX hint and
-		 * EAX=0xf0 means, do not enter any deep C-state and we use it
-		 * here in delay() to minimize wakeup latency.
-		 */
-		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
-
+		delay_halt_fn(start, cycles);
 		end = rdtsc_ordered();
 
-		if (loops <= end - start)
+		if (cycles <= end - start)
 			break;
 
-		loops -= end - start;
-
+		cycles -= end - start;
 		start = end;
 	}
 }
 
-/*
- * Since we calibrate only once at boot, this
- * function should be set once at boot and not changed
- */
-static void (*delay_fn)(unsigned long) = delay_loop;
-
-void use_tsc_delay(void)
+void __init use_tsc_delay(void)
 {
 	if (delay_fn == delay_loop)
 		delay_fn = delay_tsc;
 }
 
+void __init use_tpause_delay(void)
+{
+	delay_halt_fn = delay_halt_tpause;
+	delay_fn = delay_halt;
+}
+
 void use_mwaitx_delay(void)
 {
-	delay_fn = delay_mwaitx;
+	delay_halt_fn = delay_halt_mwaitx;
+	delay_fn = delay_halt;
 }
 
 int read_current_timer(unsigned long *timer_val)
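
The delay_halt() loop above is the interesting pattern: the halt primitive
(TPAUSE or MWAITX) may wake early, so the caller re-checks the elapsed TSC
time and re-arms until the full delay has passed. A hedged user-space
rendering of that pattern (hypothetical names, any early-waking backend)
looks like this:

    #include <stdint.h>
    #include <x86intrin.h>          /* __rdtsc() */

    /* Backend waits up to `cycles` from `start`, but may return early. */
    typedef void (*halt_fn)(uint64_t start, uint64_t cycles);

    static halt_fn delay_backend;   /* chosen once at startup */

    static void delay_cycles(uint64_t cycles)
    {
            uint64_t start = __rdtsc();

            for (;;) {
                    delay_backend(start, cycles);   /* may wake early */
                    uint64_t end = __rdtsc();

                    if (end - start >= cycles)
                            break;
                    /* charge what already elapsed, wait for the rest */
                    cycles -= end - start;
                    start = end;
            }
    }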