powerpc/64s: Reimplement book3s idle code in C

Reimplement Book3S idle code in C, moving POWER7/8/9 implementation
speific HV idle code to the powernv platform code.

Book3S assembly stubs are kept in common code and used only to save
the stack frame and non-volatile GPRs before executing architected
idle instructions, and restoring the stack and reloading GPRs then
returning to C after waking from idle.

The complex logic dealing with threads and subcores, locking, SPRs,
HMIs, timebase resync, etc., is all done in C which makes it more
maintainable.

This is not a strict translation to C code, there are some
significant differences:

- Idle wakeup no longer uses the ->cpu_restore call to reinit SPRs,
  but saves and restores them itself.

- The optimisation where EC=ESL=0 idle modes did not have to save GPRs
  or change MSR is restored, because it's now simple to do. ESL=1
  sleeps that do not lose GPRs can use this optimization too.

- KVM secondary entry and cede is now more of a call/return style
  rather than branchy. nap_state_lost is not required because KVM
  always returns via NVGPR restoring path.

- KVM secondary wakeup from offline sequence is moved entirely into
  the offline wakeup, which avoids a hwsync in the normal idle wakeup
  path.

Performance measured with context switch ping-pong on different
threads or cores, is possibly improved a small amount, 1-3% depending
on stop state and core vs thread test for shallow states. Deep states
it's in the noise compared with other latencies.

KVM improvements:

- Idle sleepers now always return to caller rather than branch out
  to KVM first.

- This allows optimisations like very fast return to caller when no
  state has been lost.

- KVM no longer requires nap_state_lost because it controls NVGPR
  save/restore itself on the way in and out.

- The heavy idle wakeup KVM request check can be moved out of the
  normal host idle code and into the not-performance-critical offline
  code.

- KVM nap code now returns from where it is called, which makes the
  flow a bit easier to follow.

Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Squash the KVM changes in]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Nicholas Piggin 2019-04-13 00:30:52 +10:00 committed by Michael Ellerman
parent c1fe190c06
commit 10d91611f4
12 changed files with 977 additions and 1226 deletions

View file

@ -27,10 +27,11 @@
* the THREAD_WINKLE_BITS are set, which indicate which threads have not
* yet woken from the winkle state.
*/
#define PNV_CORE_IDLE_LOCK_BIT 0x10000000
#define NR_PNV_CORE_IDLE_LOCK_BIT 28
#define PNV_CORE_IDLE_LOCK_BIT (1ULL << NR_PNV_CORE_IDLE_LOCK_BIT)
#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT 16
#define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000
#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000
#define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000
#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8
#define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00
@ -68,16 +69,6 @@
#define ERR_DEEP_STATE_ESL_MISMATCH -2
#ifndef __ASSEMBLY__
/* Additional SPRs that need to be saved/restored during stop */
struct stop_sprs {
u64 pid;
u64 ldbar;
u64 fscr;
u64 hfscr;
u64 mmcr1;
u64 mmcr2;
u64 mmcra;
};
#define PNV_IDLE_NAME_LEN 16
struct pnv_idle_states_t {
@ -92,10 +83,6 @@ struct pnv_idle_states_t {
extern struct pnv_idle_states_t *pnv_idle_states;
extern int nr_pnv_idle_states;
extern u32 pnv_fastsleep_workaround_at_entry[];
extern u32 pnv_fastsleep_workaround_at_exit[];
extern u64 pnv_first_deep_stop_state;
unsigned long pnv_cpu_offline(unsigned int cpu);
int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);

View file

@ -173,7 +173,6 @@ struct paca_struct {
u8 irq_happened; /* irq happened while soft-disabled */
u8 io_sync; /* writel() needs spin_unlock sync */
u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */
u8 nap_state_lost; /* NV GPR values lost in power7_idle */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
u8 pmcregs_in_use; /* pseries puts this in lppaca */
#endif
@ -183,23 +182,28 @@ struct paca_struct {
#endif
#ifdef CONFIG_PPC_POWERNV
/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
u32 *core_idle_state_ptr;
u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */
/* Mask to indicate thread id in core */
u8 thread_mask;
/* Mask to denote subcore sibling threads */
u8 subcore_sibling_mask;
/* Flag to request this thread not to stop */
atomic_t dont_stop;
/* The PSSCR value that the kernel requested before going to stop */
u64 requested_psscr;
/* PowerNV idle fields */
/* PNV_CORE_IDLE_* bits, all siblings work on thread 0 paca */
unsigned long idle_state;
union {
/* P7/P8 specific fields */
struct {
/* PNV_THREAD_RUNNING/NAP/SLEEP */
u8 thread_idle_state;
/* Mask to denote subcore sibling threads */
u8 subcore_sibling_mask;
};
/*
* Save area for additional SPRs that need to be
* saved/restored during cpuidle stop.
*/
struct stop_sprs stop_sprs;
/* P9 specific fields */
struct {
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/* The PSSCR value that the kernel requested before going to stop */
u64 requested_psscr;
/* Flag to request this thread not to stop */
atomic_t dont_stop;
#endif
};
};
#endif
#ifdef CONFIG_PPC_BOOK3S_64

View file

@ -411,14 +411,17 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
}
#endif
/* asm stubs */
extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
extern unsigned long cpuidle_disable;
enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
extern int powersave_nap; /* set if nap mode can be used in idle loop */
extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/
extern void power7_idle_type(unsigned long type);
extern unsigned long power9_idle_stop(unsigned long psscr_val);
extern unsigned long power9_offline_stop(unsigned long psscr_val);
extern void power9_idle_type(unsigned long stop_psscr_val,
unsigned long stop_psscr_mask);

View file

@ -168,6 +168,7 @@
#define PSSCR_ESL 0x00200000 /* Enable State Loss */
#define PSSCR_SD 0x00400000 /* Status Disable */
#define PSSCR_PLS 0xf000000000000000 /* Power-saving Level Status */
#define PSSCR_PLS_SHIFT 60
#define PSSCR_GUEST_VIS 0xf0000000000003ffUL /* Guest-visible PSSCR fields */
#define PSSCR_FAKE_SUSPEND 0x00000400 /* Fake-suspend bit (P9 DD2.2) */
#define PSSCR_FAKE_SUSPEND_LG 10 /* Fake-suspend bit position */
@ -758,10 +759,9 @@
#define SRR1_WAKERESET 0x00100000 /* System reset */
#define SRR1_WAKEHDBELL 0x000c0000 /* Hypervisor doorbell on P8 */
#define SRR1_WAKESTATE 0x00030000 /* Powersave exit mask [46:47] */
#define SRR1_WS_DEEPEST 0x00030000 /* Some resources not maintained,
* may not be recoverable */
#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */
#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */
#define SRR1_WS_HVLOSS 0x00030000 /* HV resources not maintained */
#define SRR1_WS_GPRLOSS 0x00020000 /* GPRs not maintained */
#define SRR1_WS_NOLOSS 0x00010000 /* All resources maintained */
#define SRR1_PROGTM 0x00200000 /* TM Bad Thing */
#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
#define SRR1_PROGILL 0x00080000 /* Illegal instruction */

View file

@ -268,7 +268,6 @@ int main(void)
OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
OFFSET(ACCOUNT_SYSTEM_TIME, paca_struct, accounting.stime);
OFFSET(PACA_TRAP_SAVE, paca_struct, trap_save);
OFFSET(PACA_NAPSTATELOST, paca_struct, nap_state_lost);
OFFSET(PACA_SPRG_VDSO, paca_struct, sprg_vdso);
#else /* CONFIG_PPC64 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
@ -766,23 +765,6 @@ int main(void)
OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl);
#endif
#ifdef CONFIG_PPC_POWERNV
OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr);
OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f)
STOP_SPR(STOP_PID, pid);
STOP_SPR(STOP_LDBAR, ldbar);
STOP_SPR(STOP_FSCR, fscr);
STOP_SPR(STOP_HFSCR, hfscr);
STOP_SPR(STOP_MMCR1, mmcr1);
STOP_SPR(STOP_MMCR2, mmcr2);
STOP_SPR(STOP_MMCRA, mmcra);
#endif
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);

View file

@ -120,7 +120,9 @@ EXC_VIRT_NONE(0x4000, 0x100)
mfspr r10,SPRN_SRR1 ; \
rlwinm. r10,r10,47-31,30,31 ; \
beq- 1f ; \
cmpwi cr3,r10,2 ; \
cmpwi cr1,r10,2 ; \
mfspr r3,SPRN_SRR1 ; \
bltlr cr1 ; /* no state loss, return to idle caller */ \
BRANCH_TO_C000(r10, system_reset_idle_common) ; \
1: \
KVMTEST_PR(n) ; \
@ -144,8 +146,11 @@ TRAMP_KVM(PACA_EXNMI, 0x100)
#ifdef CONFIG_PPC_P7_NAP
EXC_COMMON_BEGIN(system_reset_idle_common)
mfspr r12,SPRN_SRR1
b pnv_powersave_wakeup
/*
* This must be a direct branch (without linker branch stub) because
* we can not use TOC at this point as r2 may not be restored yet.
*/
b idle_return_gpr_loss
#endif
/*
@ -427,17 +432,17 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
* Then decrement MCE nesting after finishing with the stack.
*/
ld r3,_MSR(r1)
ld r4,_LINK(r1)
lhz r11,PACA_IN_MCE(r13)
subi r11,r11,1
sth r11,PACA_IN_MCE(r13)
/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
/* Recoverability could be improved by reducing the use of SRR1. */
li r11,0
mtmsrd r11,1
b pnv_powersave_wakeup_mce
mtlr r4
rlwinm r10,r3,47-31,30,31
cmpwi cr1,r10,2
bltlr cr1 /* no state loss, return to idle caller */
b idle_return_gpr_loss
#endif
/*
* Handle machine check early in real mode. We come here with

File diff suppressed because it is too large Load diff

View file

@ -401,8 +401,8 @@ void __init check_for_initrd(void)
#ifdef CONFIG_SMP
int threads_per_core, threads_per_subcore, threads_shift;
cpumask_t threads_core_mask;
int threads_per_core, threads_per_subcore, threads_shift __read_mostly;
cpumask_t threads_core_mask __read_mostly;
EXPORT_SYMBOL_GPL(threads_per_core);
EXPORT_SYMBOL_GPL(threads_per_subcore);
EXPORT_SYMBOL_GPL(threads_shift);

View file

@ -35,6 +35,7 @@
#include <asm/thread_info.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
#include <asm/cpuidle.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
@ -45,6 +46,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
#define NAPPING_NOVCPU 2
#define NAPPING_UNSPLIT 3
/* Stack frame offsets for kvmppc_hv_entry */
#define SFS 208
@ -290,17 +292,19 @@ kvm_novcpu_exit:
b kvmhv_switch_to_host
/*
* We come in here when wakened from nap mode.
* Relocation is off and most register values are lost.
* r13 points to the PACA.
* We come in here when wakened from Linux offline idle code.
* Relocation is off
* r3 contains the SRR1 wakeup value, SRR1 is trashed.
*/
.globl kvm_start_guest
kvm_start_guest:
/* Set runlatch bit the minute you wake up from nap */
mfspr r0, SPRN_CTRLF
ori r0, r0, 1
mtspr SPRN_CTRLT, r0
_GLOBAL(idle_kvm_start_guest)
ld r4,PACAEMERGSP(r13)
mfcr r5
mflr r0
std r1,0(r4)
std r5,8(r4)
std r0,16(r4)
subi r1,r4,STACK_FRAME_OVERHEAD
SAVE_NVGPRS(r1)
/*
* Could avoid this and pass it through in r3. For now,
@ -308,27 +312,23 @@ kvm_start_guest:
*/
mtspr SPRN_SRR1,r3
ld r2,PACATOC(r13)
li r0,0
stb r0,PACA_FTRACE_ENABLED(r13)
li r0,KVM_HWTHREAD_IN_KVM
stb r0,HSTATE_HWTHREAD_STATE(r13)
/* NV GPR values from power7_idle() will no longer be valid */
li r0,1
stb r0,PACA_NAPSTATELOST(r13)
/* were we napping due to cede? */
/* kvm cede / napping does not come through here */
lbz r0,HSTATE_NAPPING(r13)
cmpwi r0,NAPPING_CEDE
beq kvm_end_cede
cmpwi r0,NAPPING_NOVCPU
beq kvm_novcpu_wakeup
twnei r0,0
ld r1,PACAEMERGSP(r13)
subi r1,r1,STACK_FRAME_OVERHEAD
b 1f
kvm_unsplit_wakeup:
li r0, 0
stb r0, HSTATE_NAPPING(r13)
1:
/*
* We weren't napping due to cede, so this must be a secondary
@ -437,19 +437,25 @@ kvm_no_guest:
lbz r3, HSTATE_HWTHREAD_REQ(r13)
cmpwi r3, 0
bne 54f
/*
* We jump to pnv_wakeup_loss, which will return to the caller
* of power7_nap in the powernv cpu offline loop. The value we
* put in r3 becomes the return value for power7_nap. pnv_wakeup_loss
* requires SRR1 in r12.
*/
/*
* Jump to idle_return_gpr_loss, which returns to the
* idle_kvm_start_guest caller.
*/
li r3, LPCR_PECE0
mfspr r4, SPRN_LPCR
rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
mtspr SPRN_LPCR, r4
li r3, 0
mfspr r12,SPRN_SRR1
b pnv_wakeup_loss
/* set up r3 for return */
mfspr r3,SPRN_SRR1
REST_NVGPRS(r1)
addi r1, r1, STACK_FRAME_OVERHEAD
ld r0, 16(r1)
ld r5, 8(r1)
ld r1, 0(r1)
mtlr r0
mtcr r5
blr
53: HMT_LOW
ld r5, HSTATE_KVM_VCORE(r13)
@ -534,6 +540,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq 57f
li r3, NAPPING_UNSPLIT
stb r3, HSTATE_NAPPING(r13)
li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
mfspr r5, SPRN_LPCR
rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
@ -2657,6 +2665,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */
/* Go back to host stack */
ld r1, HSTATE_HOST_R1(r13)
/*
* Take a nap until a decrementer or external or doobell interrupt
* occurs, with PECE1 and PECE0 set in LPCR.
@ -2685,26 +2696,42 @@ BEGIN_FTR_SECTION
* requested level = 0 (just stop dispatching)
*/
lis r3, (PSSCR_EC | PSSCR_ESL)@h
mtspr SPRN_PSSCR, r3
/* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
li r4, LPCR_PECE_HVEE@higher
sldi r4, r4, 32
or r5, r5, r4
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
FTR_SECTION_ELSE
li r3, PNV_THREAD_NAP
ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r5
isync
li r0, 0
std r0, HSTATE_SCRATCH0(r13)
ptesync
ld r0, HSTATE_SCRATCH0(r13)
1: cmpd r0, r0
bne 1b
BEGIN_FTR_SECTION
nap
bl isa300_idle_stop_mayloss
FTR_SECTION_ELSE
PPC_STOP
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
b .
bl isa206_idle_insn_mayloss
ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mfspr r0, SPRN_CTRLF
ori r0, r0, 1
mtspr SPRN_CTRLT, r0
mtspr SPRN_SRR1, r3
li r0, 0
stb r0, PACA_FTRACE_ENABLED(r13)
li r0, KVM_HWTHREAD_IN_KVM
stb r0, HSTATE_HWTHREAD_STATE(r13)
lbz r0, HSTATE_NAPPING(r13)
cmpwi r0, NAPPING_CEDE
beq kvm_end_cede
cmpwi r0, NAPPING_NOVCPU
beq kvm_novcpu_wakeup
cmpwi r0, NAPPING_UNSPLIT
beq kvm_unsplit_wakeup
twi 31,0,0 /* Nap state must not be zero */
33: mr r4, r3
li r3, 0
@ -2712,12 +2739,11 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
b 34f
kvm_end_cede:
/* Woken by external or decrementer interrupt */
/* get vcpu pointer */
ld r4, HSTATE_KVM_VCPU(r13)
/* Woken by external or decrementer interrupt */
ld r1, HSTATE_HOST_R1(r13)
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r4, VCPU_TB_RMINTR
bl kvmhv_accumulate_time

File diff suppressed because it is too large Load diff

View file

@ -183,7 +183,7 @@ static void unsplit_core(void)
cpu = smp_processor_id();
if (cpu_thread_in_core(cpu) != 0) {
while (mfspr(SPRN_HID0) & mask)
power7_idle_insn(PNV_THREAD_NAP);
power7_idle_type(PNV_THREAD_NAP);
per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
return;

View file

@ -2431,7 +2431,6 @@ static void dump_one_paca(int cpu)
DUMP(p, irq_happened, "%#-*x");
DUMP(p, io_sync, "%#-*x");
DUMP(p, irq_work_pending, "%#-*x");
DUMP(p, nap_state_lost, "%#-*x");
DUMP(p, sprg_vdso, "%#-*llx");
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@ -2439,19 +2438,16 @@ static void dump_one_paca(int cpu)
#endif
#ifdef CONFIG_PPC_POWERNV
DUMP(p, core_idle_state_ptr, "%-*px");
DUMP(p, thread_idle_state, "%#-*x");
DUMP(p, thread_mask, "%#-*x");
DUMP(p, subcore_sibling_mask, "%#-*x");
DUMP(p, requested_psscr, "%#-*llx");
DUMP(p, stop_sprs.pid, "%#-*llx");
DUMP(p, stop_sprs.ldbar, "%#-*llx");
DUMP(p, stop_sprs.fscr, "%#-*llx");
DUMP(p, stop_sprs.hfscr, "%#-*llx");
DUMP(p, stop_sprs.mmcr1, "%#-*llx");
DUMP(p, stop_sprs.mmcr2, "%#-*llx");
DUMP(p, stop_sprs.mmcra, "%#-*llx");
DUMP(p, dont_stop.counter, "%#-*x");
DUMP(p, idle_state, "%#-*lx");
if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
DUMP(p, thread_idle_state, "%#-*x");
DUMP(p, subcore_sibling_mask, "%#-*x");
} else {
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
DUMP(p, requested_psscr, "%#-*llx");
DUMP(p, dont_stop.counter, "%#-*x");
#endif
}
#endif
DUMP(p, accounting.utime, "%#-*lx");