s390/fpu: limit save and restore to used registers

The first invocation of kernel_fpu_begin() after switching from user to
kernel context will save all vector registers, even if only parts of the
vector registers are used within the kernel fpu context. Given that save
and restore of all vector registers is quite expensive, change the current
approach in several ways:

- Instead of saving and restoring all user registers, limit this to those
  registers which are actually used within a kernel fpu context (see the
  sketch below).

- On context switch save all remaining user fpu registers, so they can be
  restored when the task is rescheduled.

- Saving user registers within kernel_fpu_begin() is done without disabling
  and enabling interrupts, which also slightly reduces runtime. In the worst
  case (e.g. an interrupt context uses the same registers) this may lead to
  registers being saved several times; however, the assumption is that this
  will not happen frequently, so the new method is faster in nearly all
  cases.

- save_user_fpu_regs() can still be called from all contexts and saves all
  (or all remaining) user registers to a task's ufpu user fpu save area.

Overall this reduces the time required to save and restore the user fpu
context for nearly all cases.
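
To make the flag bookkeeping concrete, the following userspace model mimics
how the two new per-thread masks interact. It is an illustration only, not
code from the patch; the constants mirror the KERNEL_* flag encoding from
fpu.h, and the two variables stand in for the new per-thread
ufpu_flags/kfpu_flags fields:

	#include <stdio.h>

	/* Stand-ins for the KERNEL_* flag bits (same encoding as fpu.h). */
	#define KERNEL_FPC	0x01
	#define KERNEL_VXR_LOW	0x06	/* V0-V15 */
	#define KERNEL_VXR_HIGH	0x18	/* V16-V31 */
	#define KERNEL_VXR	(KERNEL_VXR_LOW | KERNEL_VXR_HIGH)

	static int ufpu_flags;	/* user register sets already saved */
	static int kfpu_flags;	/* register sets in use by kernel fpu contexts */

	/* Models the flag logic of the new _kernel_fpu_begin(). */
	static void model_kernel_fpu_begin(int flags)
	{
		/* Save only those user register sets not saved yet. */
		if ((ufpu_flags & flags) != flags) {
			printf("save user sets 0x%02x\n", ~ufpu_flags & flags);
			ufpu_flags |= flags;
		}
		/* A nested context spills only conflicting kernel sets. */
		if (kfpu_flags & flags)
			printf("save kernel sets 0x%02x\n", kfpu_flags & flags);
		kfpu_flags |= flags;
	}

	int main(void)
	{
		model_kernel_fpu_begin(KERNEL_VXR_LOW);	/* saves user V0-V15 only */
		model_kernel_fpu_begin(KERNEL_VXR);	/* saves just user V16-V31 */
		return 0;
	}

The second call saves only the user registers V16-V31, because V0-V15 were
already saved by the first call; it also spills the outer context's
kernel-owned V0-V15, the normal cost of nested kernel fpu usage.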

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
Author: Heiko Carstens <hca@linux.ibm.com>
Date:   2024-02-03 11:45:18 +01:00
commit 8c09871a95
parent 066c40918b

4 changed files with 130 additions and 71 deletions

arch/s390/include/asm/entry-common.h

@@ -41,8 +41,7 @@ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs,
 static __always_inline void arch_exit_to_user_mode(void)
 {
-	if (test_thread_flag(TIF_FPU))
-		__load_user_fpu_regs();
+	load_user_fpu_regs();
 	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
 		debug_user_asce(1);

arch/s390/include/asm/fpu.h

@@ -58,10 +58,6 @@ static inline bool cpu_has_vx(void)
 	return likely(test_facility(129));
 }
 
-void save_user_fpu_regs(void);
-void load_user_fpu_regs(void);
-void __load_user_fpu_regs(void);
-
 enum {
 	KERNEL_FPC_BIT = 0,
 	KERNEL_VXR_V0V7_BIT,
@@ -83,6 +79,8 @@ enum {
 #define KERNEL_VXR	(KERNEL_VXR_LOW | KERNEL_VXR_HIGH)
 #define KERNEL_FPR	(KERNEL_FPC | KERNEL_VXR_LOW)
 
+void load_fpu_state(struct fpu *state, int flags);
+void save_fpu_state(struct fpu *state, int flags);
 void __kernel_fpu_begin(struct kernel_fpu *state, int flags);
 void __kernel_fpu_end(struct kernel_fpu *state, int flags);
@@ -162,26 +160,57 @@ static __always_inline void load_fp_regs_vx(__vector128 *vxrs)
 	__load_fp_regs(fprs, sizeof(__vector128) / sizeof(freg_t));
 }
 
-static inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
+static inline void load_user_fpu_regs(void)
 {
-	state->hdr.mask = READ_ONCE(current->thread.kfpu_flags);
-	if (!test_thread_flag(TIF_FPU)) {
-		/* Save user space FPU state and register contents */
-		save_user_fpu_regs();
-	} else if (state->hdr.mask & flags) {
-		/* Save FPU/vector register in-use by the kernel */
-		__kernel_fpu_begin(state, flags);
-	}
-	__atomic_or(flags, &current->thread.kfpu_flags);
+	struct thread_struct *thread = &current->thread;
+
+	if (!thread->ufpu_flags)
+		return;
+	load_fpu_state(&thread->ufpu, thread->ufpu_flags);
+	thread->ufpu_flags = 0;
 }
 
-static inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
+static __always_inline void __save_user_fpu_regs(struct thread_struct *thread, int flags)
 {
-	WRITE_ONCE(current->thread.kfpu_flags, state->hdr.mask);
-	if (state->hdr.mask & flags) {
-		/* Restore FPU/vector register in-use by the kernel */
-		__kernel_fpu_end(state, flags);
-	}
+	save_fpu_state(&thread->ufpu, flags);
+	__atomic_or(flags, &thread->ufpu_flags);
+}
+
+static inline void save_user_fpu_regs(void)
+{
+	struct thread_struct *thread = &current->thread;
+	int mask, flags;
+
+	mask = __atomic_or(KERNEL_FPC | KERNEL_VXR, &thread->kfpu_flags);
+	flags = ~READ_ONCE(thread->ufpu_flags) & (KERNEL_FPC | KERNEL_VXR);
+	if (flags)
+		__save_user_fpu_regs(thread, flags);
+	barrier();
+	WRITE_ONCE(thread->kfpu_flags, mask);
+}
+
+static __always_inline void _kernel_fpu_begin(struct kernel_fpu *state, int flags)
+{
+	struct thread_struct *thread = &current->thread;
+	int mask, uflags;
+
+	mask = __atomic_or(flags, &thread->kfpu_flags);
+	state->hdr.mask = mask;
+	uflags = READ_ONCE(thread->ufpu_flags);
+	if ((uflags & flags) != flags)
+		__save_user_fpu_regs(thread, ~uflags & flags);
+	if (mask & flags)
+		__kernel_fpu_begin(state, flags);
+}
+
+static __always_inline void _kernel_fpu_end(struct kernel_fpu *state, int flags)
+{
+	int mask = state->hdr.mask;
+
+	if (mask & flags)
+		__kernel_fpu_end(state, flags);
+	barrier();
+	WRITE_ONCE(current->thread.kfpu_flags, mask);
 }
 
 void __kernel_fpu_invalid_size(void);
@@ -222,28 +251,16 @@ static __always_inline void kernel_fpu_check_size(int flags, unsigned int size)
 
 static inline void save_kernel_fpu_regs(struct thread_struct *thread)
 {
-	struct fpu *state = &thread->kfpu;
-
 	if (!thread->kfpu_flags)
 		return;
-	fpu_stfpc(&state->fpc);
-	if (likely(cpu_has_vx()))
-		save_vx_regs(state->vxrs);
-	else
-		save_fp_regs_vx(state->vxrs);
+	save_fpu_state(&thread->kfpu, thread->kfpu_flags);
 }
 
 static inline void restore_kernel_fpu_regs(struct thread_struct *thread)
 {
-	struct fpu *state = &thread->kfpu;
-
 	if (!thread->kfpu_flags)
 		return;
-	fpu_lfpc(&state->fpc);
-	if (likely(cpu_has_vx()))
-		load_vx_regs(state->vxrs);
-	else
-		load_fp_regs_vx(state->vxrs);
+	load_fpu_state(&thread->kfpu, thread->kfpu_flags);
 }
 
 static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs)
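
Two details of these inline functions are worth spelling out. __atomic_or()
marks the register sets as in-use before any register is touched, and the
trailing barrier() plus WRITE_ONCE() restore the previous mask only after all
saves and restores are done; an interrupting kernel fpu context that observes
a set bit saves those registers itself via __kernel_fpu_begin(), which is
exactly the "saved several times" worst case mentioned in the commit message.
A caller would look roughly like the sketch below; the function is
hypothetical, and DECLARE_KERNEL_FPU_ONSTACK16() is assumed to be the sized
on-stack state helper from this patch series:

	#include <asm/fpu.h>

	/* Hypothetical kernel fpu user that touches only V0-V15. */
	static void example_vxr_low_user(void)
	{
		DECLARE_KERNEL_FPU_ONSTACK16(vxstate);

		kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW);
		/* ... vector instructions using V0-V15 only ... */
		kernel_fpu_end(&vxstate, KERNEL_VXR_LOW);
	}

On the first kernel_fpu_begin() after return from user space this saves only
the user contents of V0-V15 to the ufpu area; the remaining user registers
are saved later, and only if the task is context switched or another kernel
fpu context actually claims them.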

arch/s390/include/asm/processor.h

@@ -166,6 +166,7 @@ struct thread_struct {
 	unsigned int gmap_write_flag;	/* gmap fault write indication */
 	unsigned int gmap_int_code;	/* int code of last gmap fault */
 	unsigned int gmap_pfault;	/* signal of a pending guest pfault */
+	int ufpu_flags;			/* user fpu flags */
 	int kfpu_flags;			/* kernel fpu flags */
 
 	/* Per-thread information related to debugging */

arch/s390/kernel/fpu.c

@@ -107,45 +107,87 @@ void __kernel_fpu_end(struct kernel_fpu *state, int flags)
 }
 EXPORT_SYMBOL(__kernel_fpu_end);
 
-void __load_user_fpu_regs(void)
+void load_fpu_state(struct fpu *state, int flags)
 {
-	struct fpu *state = &current->thread.ufpu;
+	__vector128 *vxrs = &state->vxrs[0];
+	int mask;
 
-	fpu_lfpc_safe(&state->fpc);
-	if (likely(cpu_has_vx()))
-		load_vx_regs(state->vxrs);
-	else
-		load_fp_regs_vx(state->vxrs);
-	clear_thread_flag(TIF_FPU);
+	if (flags & KERNEL_FPC)
+		fpu_lfpc(&state->fpc);
+	if (!cpu_has_vx()) {
+		if (flags & KERNEL_VXR_V0V7)
+			load_fp_regs_vx(state->vxrs);
+		return;
+	}
+	mask = flags & KERNEL_VXR;
+	if (mask == KERNEL_VXR) {
+		fpu_vlm(0, 15, &vxrs[0]);
+		fpu_vlm(16, 31, &vxrs[16]);
+		return;
+	}
+	if (mask == KERNEL_VXR_MID) {
+		fpu_vlm(8, 23, &vxrs[8]);
+		return;
+	}
+	mask = flags & KERNEL_VXR_LOW;
+	if (mask) {
+		if (mask == KERNEL_VXR_LOW)
+			fpu_vlm(0, 15, &vxrs[0]);
+		else if (mask == KERNEL_VXR_V0V7)
+			fpu_vlm(0, 7, &vxrs[0]);
+		else
+			fpu_vlm(8, 15, &vxrs[8]);
+	}
+	mask = flags & KERNEL_VXR_HIGH;
+	if (mask) {
+		if (mask == KERNEL_VXR_HIGH)
+			fpu_vlm(16, 31, &vxrs[16]);
+		else if (mask == KERNEL_VXR_V16V23)
+			fpu_vlm(16, 23, &vxrs[16]);
+		else
+			fpu_vlm(24, 31, &vxrs[24]);
+	}
 }
 
-void load_user_fpu_regs(void)
+void save_fpu_state(struct fpu *state, int flags)
 {
-	raw_local_irq_disable();
-	__load_user_fpu_regs();
-	raw_local_irq_enable();
+	__vector128 *vxrs = &state->vxrs[0];
+	int mask;
+
+	if (flags & KERNEL_FPC)
+		fpu_stfpc(&state->fpc);
+	if (!cpu_has_vx()) {
+		if (flags & KERNEL_VXR_LOW)
+			save_fp_regs_vx(state->vxrs);
+		return;
+	}
+	mask = flags & KERNEL_VXR;
+	if (mask == KERNEL_VXR) {
+		fpu_vstm(0, 15, &vxrs[0]);
+		fpu_vstm(16, 31, &vxrs[16]);
+		return;
+	}
+	if (mask == KERNEL_VXR_MID) {
+		fpu_vstm(8, 23, &vxrs[8]);
+		return;
+	}
+	mask = flags & KERNEL_VXR_LOW;
+	if (mask) {
+		if (mask == KERNEL_VXR_LOW)
+			fpu_vstm(0, 15, &vxrs[0]);
+		else if (mask == KERNEL_VXR_V0V7)
+			fpu_vstm(0, 7, &vxrs[0]);
+		else
+			fpu_vstm(8, 15, &vxrs[8]);
+	}
+	mask = flags & KERNEL_VXR_HIGH;
+	if (mask) {
+		if (mask == KERNEL_VXR_HIGH)
+			fpu_vstm(16, 31, &vxrs[16]);
+		else if (mask == KERNEL_VXR_V16V23)
+			fpu_vstm(16, 23, &vxrs[16]);
+		else
+			fpu_vstm(24, 31, &vxrs[24]);
+	}
 }
-EXPORT_SYMBOL(load_user_fpu_regs);
-
-void save_user_fpu_regs(void)
-{
-	unsigned long flags;
-	struct fpu *state;
-
-	local_irq_save(flags);
-	if (test_thread_flag(TIF_FPU))
-		goto out;
-	state = &current->thread.ufpu;
-	fpu_stfpc(&state->fpc);
-	if (likely(cpu_has_vx()))
-		save_vx_regs(state->vxrs);
-	else
-		save_fp_regs_vx(state->vxrs);
-	set_thread_flag(TIF_FPU);
-out:
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(save_user_fpu_regs);
+EXPORT_SYMBOL(save_fpu_state);
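
The if/else cascades above exist because VLM and VSTM, the instructions
behind fpu_vlm()/fpu_vstm(), operate on ranges of consecutive vector
registers, so the four flag bits have to be coalesced into at most two
contiguous ranges, with the KERNEL_VXR_MID overlap (V8-V23) special-cased
into a single operation. The following userspace model of the range
selection is an illustration only; the constants mirror the KERNEL_VXR_*
bit encoding:

	#include <stdio.h>

	/* Assumed bit encoding, mirroring the KERNEL_VXR_* flags. */
	#define V0V7	0x02
	#define V8V15	0x04
	#define V16V23	0x08
	#define V24V31	0x10
	#define VXR_LOW		(V0V7 | V8V15)
	#define VXR_HIGH	(V16V23 | V24V31)
	#define VXR_MID		(V8V15 | V16V23)

	/* Print the VLM/VSTM register ranges the dispatch would use. */
	static void vxr_ranges(int flags)
	{
		int mask = flags & (VXR_LOW | VXR_HIGH);

		if (mask == (VXR_LOW | VXR_HIGH)) {
			printf("0-15 16-31\n");	/* all 32, as two ops */
			return;
		}
		if (mask == VXR_MID) {
			printf("8-23\n");	/* one combined range */
			return;
		}
		mask = flags & VXR_LOW;
		if (mask) {
			if (mask == VXR_LOW)
				printf("0-15 ");
			else if (mask == V0V7)
				printf("0-7 ");
			else
				printf("8-15 ");
		}
		mask = flags & VXR_HIGH;
		if (mask) {
			if (mask == VXR_HIGH)
				printf("16-31");
			else if (mask == V16V23)
				printf("16-23");
			else
				printf("24-31");
		}
		printf("\n");
	}

	int main(void)
	{
		vxr_ranges(V0V7 | V24V31);	/* prints "0-7 24-31": two ops */
		vxr_ranges(V8V15 | V16V23);	/* prints "8-23": a single op */
		return 0;
	}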