linux-stable/arch/x86/include/asm/mwait.h
Wyes Karny aebef63cf7 x86: Remove vendor checks from prefer_mwait_c1_over_halt
Remove vendor checks from prefer_mwait_c1_over_halt function. Restore
the decision tree to support MWAIT C1 as the default idle state based on
CPUID checks as done by Thomas Gleixner in
commit 09fd4b4ef5 ("x86: use cpuid to check MWAIT support for C1")

The decision tree is removed in
commit 69fb3676df ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param")

Prefer MWAIT when the following conditions are satisfied:
    1. CPUID_Fn00000001_ECX [Monitor] should be set
    2. CPUID_Fn00000005 should be supported
    3. If CPUID_Fn00000005_ECX [EMX] is set then there should be
       at least one C1 substate available, indicated by
       CPUID_Fn00000005_EDX [MWaitC1SubStates] bits.

Otherwise use HLT for default_idle function.

HPC customers who want to optimize for lower latency are known to
disable Global C-States in the BIOS. In fact, some vendors allow
choosing a BIOS 'performance' profile which explicitly disables
C-States.  In this scenario, the cpuidle driver will not be loaded and
the kernel will continue with the default idle state chosen at boot
time. On AMD systems currently the default idle state is HLT which has
a higher exit latency compared to MWAIT.

The reason for the choice of HLT over MWAIT on AMD systems is:

1. Families prior to 10h didn't support MWAIT
2. Families 10h-15h supported MWAIT, but not MWAIT C1. Hence it was
   preferable to use HLT as the default state on these systems.

However, AMD Family 17h onwards supports MWAIT as well as MWAIT C1. And
it is preferable to use MWAIT as the default idle state on these
systems, as it has lower exit latencies.

The below table represents the exit latency for HLT and MWAIT on AMD
Zen 3 system. Exit latency is measured by issuing a wakeup (IPI) to
other CPU and measuring how many clock cycles it took to wakeup.  Each
iteration measures 10K wakeups by pinning source and destination.

HLT:

25.0000th percentile  :      1900 ns
50.0000th percentile  :      2000 ns
75.0000th percentile  :      2300 ns
90.0000th percentile  :      2500 ns
95.0000th percentile  :      2600 ns
99.0000th percentile  :      2800 ns
99.5000th percentile  :      3000 ns
99.9000th percentile  :      3400 ns
99.9500th percentile  :      3600 ns
99.9900th percentile  :      5900 ns
  Min latency         :      1700 ns
  Max latency         :      5900 ns
Total Samples      9999

MWAIT:

25.0000th percentile  :      1400 ns
50.0000th percentile  :      1500 ns
75.0000th percentile  :      1700 ns
90.0000th percentile  :      1800 ns
95.0000th percentile  :      1900 ns
99.0000th percentile  :      2300 ns
99.5000th percentile  :      2500 ns
99.9000th percentile  :      3200 ns
99.9500th percentile  :      3500 ns
99.9900th percentile  :      4600 ns
  Min latency         :      1200 ns
  Max latency         :      4600 ns
Total Samples      9997

Improvement (99th percentile): 21.74%

Below is another result for context_switch2 micro-benchmark, which
brings out the impact of improved wakeup latency through increased
context-switches per second.

with HLT:
-------------------------------
50.0000th percentile  :  190184
75.0000th percentile  :  191032
90.0000th percentile  :  192314
95.0000th percentile  :  192520
99.0000th percentile  :  192844
MIN  :  190148
MAX  :  192852

with MWAIT:
-------------------------------
50.0000th percentile  :  277444
75.0000th percentile  :  278268
90.0000th percentile  :  278888
95.0000th percentile  :  279164
99.0000th percentile  :  280504
MIN  :  273278
MAX  :  281410

Improvement(99th percentile): ~ 45.46%

Signed-off-by: Wyes Karny <wyes.karny@amd.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Zhang Rui <rui.zhang@intel.com>
Link: https://ozlabs.org/~anton/junkcode/context_switch2.c
Link: https://lkml.kernel.org/r/0cc675d8fd1f55e41b510e10abf2e21b6e9803d5.1654538381.git-series.wyes.karny@amd.com
2022-06-08 13:00:19 -07:00

144 lines
4.4 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MWAIT_H
#define _ASM_X86_MWAIT_H
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <asm/cpufeature.h>
#include <asm/nospec-branch.h>
#define MWAIT_SUBSTATE_MASK 0xf
#define MWAIT_CSTATE_MASK 0xf
#define MWAIT_SUBSTATE_SIZE 4
#define MWAIT_HINT2CSTATE(hint) (((hint) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK)
#define MWAIT_HINT2SUBSTATE(hint) ((hint) & MWAIT_CSTATE_MASK)
#define MWAIT_C1_SUBSTATE_MASK 0xf0
#define CPUID_MWAIT_LEAF 5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
#define CPUID5_ECX_INTERRUPT_BREAK 0x2
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
#define MWAITX_ECX_TIMER_ENABLE BIT(1)
#define MWAITX_MAX_WAIT_CYCLES UINT_MAX
#define MWAITX_DISABLE_CSTATES 0xf0
#define TPAUSE_C01_STATE 1
#define TPAUSE_C02_STATE 0
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitor %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xc8;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
static inline void __monitorx(const void *eax, unsigned long ecx,
unsigned long edx)
{
/* "monitorx %eax, %ecx, %edx;" */
asm volatile(".byte 0x0f, 0x01, 0xfa;"
:: "a" (eax), "c" (ecx), "d"(edx));
}
static inline void __mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
/*
* MWAITX allows for a timer expiration to get the core out a wait state in
* addition to the default MWAIT exit condition of a store appearing at a
* monitored virtual address.
*
* Registers:
*
* MWAITX ECX[1]: enable timer if set
* MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
* frequency is the same as the TSC frequency.
*
* Below is a comparison between MWAIT and MWAITX on AMD processors:
*
* MWAIT MWAITX
* opcode 0f 01 c9 | 0f 01 fb
* ECX[0] value of RFLAGS.IF seen by instruction
* ECX[1] unused/#GP if set | enable timer if set
* ECX[31:2] unused/#GP if set
* EAX unused (reserve for hint)
* EBX[31:0] unused | max wait time (P0 clocks)
*
* MONITOR MONITORX
* opcode 0f 01 c8 | 0f 01 fa
* EAX (logical) address to monitor
* ECX #GP if not zero
*/
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
unsigned long ecx)
{
/* No MDS buffer clear as this is AMD/HYGON only */
/* "mwaitx %eax, %ebx, %ecx;" */
asm volatile(".byte 0x0f, 0x01, 0xfb;"
:: "a" (eax), "b" (ebx), "c" (ecx));
}
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
{
mds_idle_clear_cpu_buffers();
/* "mwait %eax, %ecx;" */
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
:: "a" (eax), "c" (ecx));
}
/*
* This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
* which can obviate IPI to trigger checking of need_resched.
* We execute MONITOR against need_resched and enter optimized wait state
* through MWAIT. Whenever someone changes need_resched, we would be woken
* up from MWAIT (without an IPI).
*
* New with Core Duo processors, MWAIT can take some hints based on CPU
* capability.
*/
static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
if (static_cpu_has_bug(X86_BUG_MONITOR) || !current_set_polling_and_test()) {
if (static_cpu_has_bug(X86_BUG_CLFLUSH_MONITOR)) {
mb();
clflush((void *)&current_thread_info()->flags);
mb();
}
__monitor((void *)&current_thread_info()->flags, 0, 0);
if (!need_resched())
__mwait(eax, ecx);
}
current_clr_polling();
}
/*
* Caller can specify whether to enter C0.1 (low latency, less
* power saving) or C0.2 state (saves more power, but longer wakeup
* latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR
* which can force requests for C0.2 to be downgraded to C0.1.
*/
static inline void __tpause(u32 ecx, u32 edx, u32 eax)
{
/* "tpause %ecx, %edx, %eax;" */
#ifdef CONFIG_AS_TPAUSE
asm volatile("tpause %%ecx\n"
:
: "c"(ecx), "d"(edx), "a"(eax));
#else
asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n"
:
: "c"(ecx), "d"(edx), "a"(eax));
#endif
}
#endif /* _ASM_X86_MWAIT_H */