Pull branch 'for-rmk' of git://git.linaro.org/people/ardbiesheuvel/linux-arm into devel-stable

Comments from Ard Biesheuvel:

I have included two use cases that I have been using, XOR and RAID-6
checksumming. The former gets a 60% performance boost on the NEON, the
latter over 400%.

ARM: add support for kernel mode NEON

Adds kernel_neon_begin/end (renamed from kernel_vfp_begin/end in the
previous version to de-emphasize the VFP part as VFP code that needs
software assistance is not supported currently.)

Introduces <asm/neon.h> and the Kconfig symbol KERNEL_MODE_NEON. This
has been aligned with Catalin for arm64, so any NEON code that does
not use assembly but intrinsics or the GCC vectorizer (such as my
examples) can potentially be shared between arm and arm64 archs.

ARM: move VFP init to an earlier boot stage

This is needed so the NEON is enabled when the XOR and RAID-6 algo
boot time benchmarks are run.

ARM: be strict about FP exceptions in kernel mode

This adds a check to vfp_support_entry() to flag unsupported uses of
the NEON/VFP in kernel mode. FP exceptions (bounces) are flagged as
a bug, this is because of their potentially intermittent nature.
Exceptions caused by the fact that kernel_neon_begin has not been
called are just routed through the undef handler.

ARM: crypto: add NEON accelerated XOR implementation

This is the xor_blocks() implementation built with -ftree-vectorize,
60% faster than optimized ARM code. It calls in_interrupt() to check
whether the NEON flavor can be used: this should really not be
necessary, but due to xor_blocks'squite generic nature, there is no
telling how exactly people may be using it in the real world.

lib/raid6: add ARM-NEON accelerated syndrome calculation

This is a port of the RAID-6 checksumming code in altivec.uc ported
to use NEON intrinsics. It is about 4x faster than the sequential
code.
This commit is contained in:
Russell King 2013-07-22 17:26:27 +01:00
commit b4f656eea6
14 changed files with 452 additions and 2 deletions

View File

@ -2176,6 +2176,13 @@ config NEON
Say Y to include support code for NEON, the ARMv7 Advanced SIMD
Extension.
config KERNEL_MODE_NEON
bool "Support for NEON in kernel mode"
default n
depends on NEON
help
Say Y to include support for NEON in kernel mode.
endmenu
menu "Userspace binary formats"

View File

@ -0,0 +1,36 @@
/*
* linux/arch/arm/include/asm/neon.h
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <asm/hwcap.h>
#define cpu_has_neon() (!!(elf_hwcap & HWCAP_NEON))
#ifdef __ARM_NEON__
/*
* If you are affected by the BUILD_BUG below, it probably means that you are
* using NEON code /and/ calling the kernel_neon_begin() function from the same
* compilation unit. To prevent issues that may arise from GCC reordering or
* generating(1) NEON instructions outside of these begin/end functions, the
* only supported way of using NEON code in the kernel is by isolating it in a
* separate compilation unit, and calling it from another unit from inside a
* kernel_neon_begin/kernel_neon_end pair.
*
* (1) Current GCC (4.7) might generate NEON instructions at O3 level if
* -mpfu=neon is set.
*/
#define kernel_neon_begin() \
BUILD_BUG_ON_MSG(1, "kernel_neon_begin() called from NEON code")
#else
void kernel_neon_begin(void);
#endif
void kernel_neon_end(void);

View File

@ -7,7 +7,10 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/hardirq.h>
#include <asm-generic/xor.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#define __XOR(a1, a2) a1 ^= a2
@ -138,4 +141,74 @@ static struct xor_block_template xor_block_arm4regs = {
xor_speed(&xor_block_arm4regs); \
xor_speed(&xor_block_8regs); \
xor_speed(&xor_block_32regs); \
NEON_TEMPLATES; \
} while (0)
#ifdef CONFIG_KERNEL_MODE_NEON
extern struct xor_block_template const xor_block_neon_inner;
static void
xor_neon_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
if (in_interrupt()) {
xor_arm4regs_2(bytes, p1, p2);
} else {
kernel_neon_begin();
xor_block_neon_inner.do_2(bytes, p1, p2);
kernel_neon_end();
}
}
static void
xor_neon_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3)
{
if (in_interrupt()) {
xor_arm4regs_3(bytes, p1, p2, p3);
} else {
kernel_neon_begin();
xor_block_neon_inner.do_3(bytes, p1, p2, p3);
kernel_neon_end();
}
}
static void
xor_neon_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4)
{
if (in_interrupt()) {
xor_arm4regs_4(bytes, p1, p2, p3, p4);
} else {
kernel_neon_begin();
xor_block_neon_inner.do_4(bytes, p1, p2, p3, p4);
kernel_neon_end();
}
}
static void
xor_neon_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
if (in_interrupt()) {
xor_arm4regs_5(bytes, p1, p2, p3, p4, p5);
} else {
kernel_neon_begin();
xor_block_neon_inner.do_5(bytes, p1, p2, p3, p4, p5);
kernel_neon_end();
}
}
static struct xor_block_template xor_block_neon = {
.name = "neon",
.do_2 = xor_neon_2,
.do_3 = xor_neon_3,
.do_4 = xor_neon_4,
.do_5 = xor_neon_5
};
#define NEON_TEMPLATES \
do { if (cpu_has_neon()) xor_speed(&xor_block_neon); } while (0)
#else
#define NEON_TEMPLATES
#endif

View File

@ -45,3 +45,9 @@ lib-$(CONFIG_ARCH_SHARK) += io-shark.o
$(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
$(obj)/csumpartialcopyuser.o: $(obj)/csumpartialcopygeneric.S
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS := -mfloat-abi=softfp -mfpu=neon
CFLAGS_xor-neon.o += $(NEON_FLAGS)
lib-$(CONFIG_XOR_BLOCKS) += xor-neon.o
endif

42
arch/arm/lib/xor-neon.c Normal file
View File

@ -0,0 +1,42 @@
/*
* linux/arch/arm/lib/xor-neon.c
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/raid/xor.h>
#ifndef __ARM_NEON__
#error You should compile this file with '-mfloat-abi=softfp -mfpu=neon'
#endif
/*
* Pull in the reference implementations while instructing GCC (through
* -ftree-vectorize) to attempt to exploit implicit parallelism and emit
* NEON instructions.
*/
#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
#pragma GCC optimize "tree-vectorize"
#else
/*
* While older versions of GCC do not generate incorrect code, they fail to
* recognize the parallel nature of these functions, and emit plain ARM code,
* which is known to be slower than the optimized ARM code in asm-arm/xor.h.
*/
#warning This code requires at least version 4.6 of GCC
#endif
#pragma GCC diagnostic ignored "-Wunused-variable"
#include <asm-generic/xor.h>
struct xor_block_template const xor_block_neon_inner = {
.name = "__inner_neon__",
.do_2 = xor_8regs_2,
.do_3 = xor_8regs_3,
.do_4 = xor_8regs_4,
.do_5 = xor_8regs_5,
};

View File

@ -78,6 +78,11 @@
ENTRY(vfp_support_entry)
DBGSTR3 "instr %08x pc %08x state %p", r0, r2, r10
ldr r3, [sp, #S_PSR] @ Neither lazy restore nor FP exceptions
and r3, r3, #MODE_MASK @ are supported in kernel mode
teq r3, #USR_MODE
bne vfp_kmode_exception @ Returns through lr
VFPFMRX r1, FPEXC @ Is the VFP enabled?
DBGSTR1 "fpexc %08x", r1
tst r1, #FPEXC_EN

View File

@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/user.h>
#include <linux/export.h>
#include <asm/cp15.h>
#include <asm/cputype.h>
@ -648,6 +649,72 @@ static int vfp_hotplug(struct notifier_block *b, unsigned long action,
return NOTIFY_OK;
}
void vfp_kmode_exception(void)
{
/*
* If we reach this point, a floating point exception has been raised
* while running in kernel mode. If the NEON/VFP unit was enabled at the
* time, it means a VFP instruction has been issued that requires
* software assistance to complete, something which is not currently
* supported in kernel mode.
* If the NEON/VFP unit was disabled, and the location pointed to below
* is properly preceded by a call to kernel_neon_begin(), something has
* caused the task to be scheduled out and back in again. In this case,
* rebuilding and running with CONFIG_DEBUG_ATOMIC_SLEEP enabled should
* be helpful in localizing the problem.
*/
if (fmrx(FPEXC) & FPEXC_EN)
pr_crit("BUG: unsupported FP instruction in kernel mode\n");
else
pr_crit("BUG: FP instruction issued in kernel mode with FP unit disabled\n");
}
#ifdef CONFIG_KERNEL_MODE_NEON
/*
* Kernel-side NEON support functions
*/
void kernel_neon_begin(void)
{
struct thread_info *thread = current_thread_info();
unsigned int cpu;
u32 fpexc;
/*
* Kernel mode NEON is only allowed outside of interrupt context
* with preemption disabled. This will make sure that the kernel
* mode NEON register contents never need to be preserved.
*/
BUG_ON(in_interrupt());
cpu = get_cpu();
fpexc = fmrx(FPEXC) | FPEXC_EN;
fmxr(FPEXC, fpexc);
/*
* Save the userland NEON/VFP state. Under UP,
* the owner could be a task other than 'current'
*/
if (vfp_state_in_hw(cpu, thread))
vfp_save_state(&thread->vfpstate, fpexc);
#ifndef CONFIG_SMP
else if (vfp_current_hw_state[cpu] != NULL)
vfp_save_state(vfp_current_hw_state[cpu], fpexc);
#endif
vfp_current_hw_state[cpu] = NULL;
}
EXPORT_SYMBOL(kernel_neon_begin);
void kernel_neon_end(void)
{
/* Disable the NEON/VFP unit. */
fmxr(FPEXC, fmrx(FPEXC) & ~FPEXC_EN);
put_cpu();
}
EXPORT_SYMBOL(kernel_neon_end);
#endif /* CONFIG_KERNEL_MODE_NEON */
/*
* VFP support code initialisation.
*/
@ -731,4 +798,4 @@ static int __init vfp_init(void)
return 0;
}
late_initcall(vfp_init);
core_initcall(vfp_init);

View File

@ -114,6 +114,11 @@ extern const struct raid6_recov_calls raid6_recov_intx1;
extern const struct raid6_recov_calls raid6_recov_ssse3;
extern const struct raid6_recov_calls raid6_recov_avx2;
extern const struct raid6_calls raid6_neonx1;
extern const struct raid6_calls raid6_neonx2;
extern const struct raid6_calls raid6_neonx4;
extern const struct raid6_calls raid6_neonx8;
/* Algorithm list */
extern const struct raid6_calls * const raid6_algos[];
extern const struct raid6_recov_calls *const raid6_recov_algos[];

View File

@ -2,3 +2,4 @@ mktables
altivec*.c
int*.c
tables.c
neon?.c

View File

@ -5,6 +5,7 @@ raid6_pq-y += algos.o recov.o tables.o int1.o int2.o int4.o \
raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
hostprogs-y += mktables
@ -16,6 +17,21 @@ ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec -mabi=altivec
endif
# The GCC option -ffreestanding is required in order to compile code containing
# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS := -ffreestanding
ifeq ($(ARCH),arm)
NEON_FLAGS += -mfloat-abi=softfp -mfpu=neon
endif
ifeq ($(ARCH),arm64)
CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
endif
endif
targets += int1.c
$(obj)/int1.c: UNROLL := 1
$(obj)/int1.c: $(src)/int.uc $(src)/unroll.awk FORCE
@ -70,6 +86,30 @@ $(obj)/altivec8.c: UNROLL := 8
$(obj)/altivec8.c: $(src)/altivec.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_neon1.o += $(NEON_FLAGS)
targets += neon1.c
$(obj)/neon1.c: UNROLL := 1
$(obj)/neon1.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_neon2.o += $(NEON_FLAGS)
targets += neon2.c
$(obj)/neon2.c: UNROLL := 2
$(obj)/neon2.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_neon4.o += $(NEON_FLAGS)
targets += neon4.c
$(obj)/neon4.c: UNROLL := 4
$(obj)/neon4.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
CFLAGS_neon8.o += $(NEON_FLAGS)
targets += neon8.c
$(obj)/neon8.c: UNROLL := 8
$(obj)/neon8.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
quiet_cmd_mktable = TABLE $@
cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )

View File

@ -70,6 +70,12 @@ const struct raid6_calls * const raid6_algos[] = {
&raid6_intx2,
&raid6_intx4,
&raid6_intx8,
#ifdef CONFIG_KERNEL_MODE_NEON
&raid6_neonx1,
&raid6_neonx2,
&raid6_neonx4,
&raid6_neonx8,
#endif
NULL
};

58
lib/raid6/neon.c Normal file
View File

@ -0,0 +1,58 @@
/*
* linux/lib/raid6/neon.c - RAID6 syndrome calculation using ARM NEON intrinsics
*
* Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/raid/pq.h>
#ifdef __KERNEL__
#include <asm/neon.h>
#else
#define kernel_neon_begin()
#define kernel_neon_end()
#define cpu_has_neon() (1)
#endif
/*
* There are 2 reasons these wrappers are kept in a separate compilation unit
* from the actual implementations in neonN.c (generated from neon.uc by
* unroll.awk):
* - the actual implementations use NEON intrinsics, and the GCC support header
* (arm_neon.h) is not fully compatible (type wise) with the kernel;
* - the neonN.c files are compiled with -mfpu=neon and optimization enabled,
* and we have to make sure that we never use *any* NEON/VFP instructions
* outside a kernel_neon_begin()/kernel_neon_end() pair.
*/
#define RAID6_NEON_WRAPPER(_n) \
static void raid6_neon ## _n ## _gen_syndrome(int disks, \
size_t bytes, void **ptrs) \
{ \
void raid6_neon ## _n ## _gen_syndrome_real(int, \
unsigned long, void**); \
kernel_neon_begin(); \
raid6_neon ## _n ## _gen_syndrome_real(disks, \
(unsigned long)bytes, ptrs); \
kernel_neon_end(); \
} \
struct raid6_calls const raid6_neonx ## _n = { \
raid6_neon ## _n ## _gen_syndrome, \
raid6_have_neon, \
"neonx" #_n, \
0 \
}
static int raid6_have_neon(void)
{
return cpu_has_neon();
}
RAID6_NEON_WRAPPER(1);
RAID6_NEON_WRAPPER(2);
RAID6_NEON_WRAPPER(4);
RAID6_NEON_WRAPPER(8);

80
lib/raid6/neon.uc Normal file
View File

@ -0,0 +1,80 @@
/* -----------------------------------------------------------------------
*
* neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
*
* Copyright (C) 2012 Rob Herring
*
* Based on altivec.uc:
* Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, Inc., 53 Temple Place Ste 330,
* Boston MA 02111-1307, USA; either version 2 of the License, or
* (at your option) any later version; incorporated herein by reference.
*
* ----------------------------------------------------------------------- */
/*
* neon$#.c
*
* $#-way unrolled NEON intrinsics math RAID-6 instruction set
*
* This file is postprocessed using unroll.awk
*/
#include <arm_neon.h>
typedef uint8x16_t unative_t;
#define NBYTES(x) ((unative_t){x,x,x,x, x,x,x,x, x,x,x,x, x,x,x,x})
#define NSIZE sizeof(unative_t)
/*
* The SHLBYTE() operation shifts each byte left by 1, *not*
* rolling over into the next byte
*/
static inline unative_t SHLBYTE(unative_t v)
{
return vshlq_n_u8(v, 1);
}
/*
* The MASK() operation returns 0xFF in any byte for which the high
* bit is 1, 0x00 for any byte for which the high bit is 0.
*/
static inline unative_t MASK(unative_t v)
{
const uint8x16_t temp = NBYTES(0);
return (unative_t)vcltq_s8((int8x16_t)v, (int8x16_t)temp);
}
void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
{
uint8_t **dptr = (uint8_t **)ptrs;
uint8_t *p, *q;
int d, z, z0;
register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
const unative_t x1d = NBYTES(0x1d);
z0 = disks - 3; /* Highest data disk */
p = dptr[z0+1]; /* XOR parity */
q = dptr[z0+2]; /* RS syndrome */
for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
wq$$ = wp$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
for ( z = z0-1 ; z >= 0 ; z-- ) {
wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
wp$$ = veorq_u8(wp$$, wd$$);
w2$$ = MASK(wq$$);
w1$$ = SHLBYTE(wq$$);
w2$$ = vandq_u8(w2$$, x1d);
w1$$ = veorq_u8(w1$$, w2$$);
wq$$ = veorq_u8(w1$$, wd$$);
}
vst1q_u8(&p[d+NSIZE*$$], wp$$);
vst1q_u8(&q[d+NSIZE*$$], wq$$);
}
}

View File

@ -22,11 +22,23 @@ ifeq ($(ARCH),x86_64)
IS_X86 = yes
endif
ifeq ($(ARCH),arm)
CFLAGS += -I../../../arch/arm/include -mfpu=neon
HAS_NEON = yes
endif
ifeq ($(ARCH),arm64)
CFLAGS += -I../../../arch/arm64/include
HAS_NEON = yes
endif
ifeq ($(IS_X86),yes)
OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o
CFLAGS += $(shell echo "vpbroadcastb %xmm0, %ymm1" | \
gcc -c -x assembler - >&/dev/null && \
rm ./-.o && echo -DCONFIG_AS_AVX2=1)
else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o
CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
else
HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\
gcc -c -x c - >&/dev/null && \
@ -55,6 +67,18 @@ raid6.a: $(OBJS)
raid6test: test.c raid6.a
$(CC) $(CFLAGS) -o raid6test $^
neon1.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < neon.uc > $@
neon2.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=2 < neon.uc > $@
neon4.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=4 < neon.uc > $@
neon8.c: neon.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=8 < neon.uc > $@
altivec1.c: altivec.uc ../unroll.awk
$(AWK) ../unroll.awk -vN=1 < altivec.uc > $@
@ -89,7 +113,7 @@ tables.c: mktables
./mktables > tables.c
clean:
rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c tables.c raid6test
rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test
spotless: clean
rm -f *~