ARM: 9195/1: entry: avoid explicit literal loads

ARMv7 has MOVW/MOVT instruction pairs to load symbol addresses into
registers without having to rely on literal loads that go via the
D-cache.  For older cores, we now support a similar arrangement, based
on PC-relative group relocations.

This means we can elide most literal loads entirely from the entry path,
by switching to the ldr_va macro to emit the appropriate sequence
depending on the target architecture revision.

While at it, switch to the bl_r macro for invoking the right PABT/DABT
helpers instead of setting the LR register explicitly, which does not
play well with cores that speculate across function returns.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
This commit is contained in:
Ard Biesheuvel 2022-04-20 09:41:31 +01:00 committed by Russell King (Oracle)
parent 952f033163
commit 508074607c
4 changed files with 18 additions and 50 deletions

View File

@ -666,12 +666,11 @@ THUMB( orr \reg , \reg , #PSR_T_BIT )
__adldst_l str, \src, \sym, \tmp, \cond
.endm
.macro __ldst_va, op, reg, tmp, sym, cond
.macro __ldst_va, op, reg, tmp, sym, cond, offset
#if __LINUX_ARM_ARCH__ >= 7 || \
!defined(CONFIG_ARM_HAS_GROUP_RELOCS) || \
(defined(MODULE) && defined(CONFIG_ARM_MODULE_PLTS))
mov_l \tmp, \sym, \cond
\op\cond \reg, [\tmp]
#else
/*
* Avoid a literal load, by emitting a sequence of ADD/LDR instructions
@ -683,20 +682,21 @@ THUMB( orr \reg , \reg , #PSR_T_BIT )
.reloc .L0_\@, R_ARM_ALU_PC_G0_NC, \sym
.reloc .L1_\@, R_ARM_ALU_PC_G1_NC, \sym
.reloc .L2_\@, R_ARM_LDR_PC_G2, \sym
.L0_\@: sub\cond \tmp, pc, #8
.L1_\@: sub\cond \tmp, \tmp, #4
.L2_\@: \op\cond \reg, [\tmp, #0]
.L0_\@: sub\cond \tmp, pc, #8 - \offset
.L1_\@: sub\cond \tmp, \tmp, #4 - \offset
.L2_\@:
#endif
\op\cond \reg, [\tmp, #\offset]
.endm
/*
* ldr_va - load a 32-bit word from the virtual address of \sym
*/
.macro ldr_va, rd:req, sym:req, cond, tmp
.macro ldr_va, rd:req, sym:req, cond, tmp, offset=0
.ifnb \tmp
__ldst_va ldr, \rd, \tmp, \sym, \cond
__ldst_va ldr, \rd, \tmp, \sym, \cond, \offset
.else
__ldst_va ldr, \rd, \rd, \sym, \cond
__ldst_va ldr, \rd, \rd, \sym, \cond, \offset
.endif
.endm
@ -704,7 +704,7 @@ THUMB( orr \reg , \reg , #PSR_T_BIT )
* str_va - store a 32-bit word to the virtual address of \sym
*/
.macro str_va, rn:req, sym:req, tmp:req, cond
__ldst_va str, \rn, \tmp, \sym, \cond
__ldst_va str, \rn, \tmp, \sym, \cond, 0
.endm
/*

View File

@ -61,9 +61,8 @@
.macro pabt_helper
@ PABORT handler takes pt_regs in r2, fault address in r4 and psr in r5
#ifdef MULTI_PABORT
ldr ip, .LCprocfns
mov lr, pc
ldr pc, [ip, #PROCESSOR_PABT_FUNC]
ldr_va ip, processor, offset=PROCESSOR_PABT_FUNC
bl_r ip
#else
bl CPU_PABORT_HANDLER
#endif
@ -82,9 +81,8 @@
@ the fault status register in r1. r9 must be preserved.
@
#ifdef MULTI_DABORT
ldr ip, .LCprocfns
mov lr, pc
ldr pc, [ip, #PROCESSOR_DABT_FUNC]
ldr_va ip, processor, offset=PROCESSOR_DABT_FUNC
bl_r ip
#else
bl CPU_DABORT_HANDLER
#endif
@ -302,16 +300,6 @@ __fiq_svc:
UNWIND(.fnend )
ENDPROC(__fiq_svc)
.align 5
.LCcralign:
.word cr_alignment
#ifdef MULTI_DABORT
.LCprocfns:
.word processor
#endif
.LCfp:
.word fp_enter
/*
* Abort mode handlers
*/
@ -370,7 +358,7 @@ ENDPROC(__fiq_abt)
THUMB( stmia sp, {r0 - r12} )
ATRAP( mrc p15, 0, r7, c1, c0, 0)
ATRAP( ldr r8, .LCcralign)
ATRAP( ldr_va r8, cr_alignment)
ldmia r0, {r3 - r5}
add r0, sp, #S_PC @ here for interlock avoidance
@ -379,8 +367,6 @@ ENDPROC(__fiq_abt)
str r3, [sp] @ save the "real" r0 copied
@ from the exception stack
ATRAP( ldr r8, [r8, #0])
@
@ We are now ready to fill in the remaining blanks on the stack:
@
@ -505,9 +491,7 @@ __und_usr_thumb:
*/
#if __LINUX_ARM_ARCH__ < 7
/* If the target CPU may not be Thumb-2-capable, a run-time check is needed: */
#define NEED_CPU_ARCHITECTURE
ldr r5, .LCcpu_architecture
ldr r5, [r5]
ldr_va r5, cpu_architecture
cmp r5, #CPU_ARCH_ARMv7
blo __und_usr_fault_16 @ 16bit undefined instruction
/*
@ -654,12 +638,6 @@ call_fpe:
ret.w lr @ CP#14 (Debug)
ret.w lr @ CP#15 (Control)
#ifdef NEED_CPU_ARCHITECTURE
.align 2
.LCcpu_architecture:
.word __cpu_architecture
#endif
#ifdef CONFIG_NEON
.align 6
@ -685,9 +663,8 @@ call_fpe:
#endif
do_fpe:
ldr r4, .LCfp
add r10, r10, #TI_FPSTATE @ r10 = workspace
ldr pc, [r4] @ Call FP module USR entry point
ldr_va pc, fp_enter, tmp=r4 @ Call FP module USR entry point
/*
* The FP module is called with these registers set:

View File

@ -198,7 +198,7 @@ ENTRY(vector_swi)
#endif
reload_current r10, ip
zero_fp
alignment_trap r10, ip, __cr_alignment
alignment_trap r10, ip, cr_alignment
asm_trace_hardirqs_on save=0
enable_irq_notrace
ct_user_exit save=0
@ -328,14 +328,6 @@ __sys_trace_return:
bl syscall_trace_exit
b ret_slow_syscall
.align 5
#ifdef CONFIG_ALIGNMENT_TRAP
.type __cr_alignment, #object
__cr_alignment:
.word cr_alignment
#endif
.ltorg
.macro syscall_table_start, sym
.equ __sys_nr, 0
.type \sym, #object

View File

@ -48,8 +48,7 @@
.macro alignment_trap, rtmp1, rtmp2, label
#ifdef CONFIG_ALIGNMENT_TRAP
mrc p15, 0, \rtmp2, c1, c0, 0
ldr \rtmp1, \label
ldr \rtmp1, [\rtmp1]
ldr_va \rtmp1, \label
teq \rtmp1, \rtmp2
mcrne p15, 0, \rtmp1, c1, c0, 0
#endif