2019-05-27 06:55:01 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-10-20 16:44:46 +00:00
|
|
|
#ifndef _ASM_POWERPC_CHECKSUM_H
|
|
|
|
#define _ASM_POWERPC_CHECKSUM_H
|
2005-12-16 21:43:46 +00:00
|
|
|
#ifdef __KERNEL__
|
2005-04-16 22:20:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
*/
|
|
|
|
|
powerpc/64: optimises from64to32()
The current implementation of from64to32() gives a poor result:
0000000000000270 <.from64to32>:
270: 38 00 ff ff li r0,-1
274: 78 69 00 22 rldicl r9,r3,32,32
278: 78 00 00 20 clrldi r0,r0,32
27c: 7c 60 00 38 and r0,r3,r0
280: 7c 09 02 14 add r0,r9,r0
284: 78 09 00 22 rldicl r9,r0,32,32
288: 7c 00 4a 14 add r0,r0,r9
28c: 78 03 00 20 clrldi r3,r0,32
290: 4e 80 00 20 blr
This patch modifies from64to32() to operate in the same
spirit as csum_fold()
It swaps the two 32-bit halves of sum then it adds it with the
unswapped sum. If there is a carry from adding the two 32-bit halves,
it will carry from the lower half into the upper half, giving us the
correct sum in the upper half.
The resulting code is:
0000000000000260 <.from64to32>:
260: 78 60 00 02 rotldi r0,r3,32
264: 7c 60 1a 14 add r3,r0,r3
268: 78 63 00 22 rldicl r3,r3,32,32
26c: 4e 80 00 20 blr
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-04-10 06:34:35 +00:00
|
|
|
#include <linux/bitops.h>
|
powerpc: Implement csum_ipv6_magic in assembly
The generic csum_ipv6_magic() generates a pretty bad result
00000000 <csum_ipv6_magic>: (PPC32)
0: 81 23 00 00 lwz r9,0(r3)
4: 81 03 00 04 lwz r8,4(r3)
8: 7c e7 4a 14 add r7,r7,r9
c: 7d 29 38 10 subfc r9,r9,r7
10: 7d 4a 51 10 subfe r10,r10,r10
14: 7d 27 42 14 add r9,r7,r8
18: 7d 2a 48 50 subf r9,r10,r9
1c: 80 e3 00 08 lwz r7,8(r3)
20: 7d 08 48 10 subfc r8,r8,r9
24: 7d 4a 51 10 subfe r10,r10,r10
28: 7d 29 3a 14 add r9,r9,r7
2c: 81 03 00 0c lwz r8,12(r3)
30: 7d 2a 48 50 subf r9,r10,r9
34: 7c e7 48 10 subfc r7,r7,r9
38: 7d 4a 51 10 subfe r10,r10,r10
3c: 7d 29 42 14 add r9,r9,r8
40: 7d 2a 48 50 subf r9,r10,r9
44: 80 e4 00 00 lwz r7,0(r4)
48: 7d 08 48 10 subfc r8,r8,r9
4c: 7d 4a 51 10 subfe r10,r10,r10
50: 7d 29 3a 14 add r9,r9,r7
54: 7d 2a 48 50 subf r9,r10,r9
58: 81 04 00 04 lwz r8,4(r4)
5c: 7c e7 48 10 subfc r7,r7,r9
60: 7d 4a 51 10 subfe r10,r10,r10
64: 7d 29 42 14 add r9,r9,r8
68: 7d 2a 48 50 subf r9,r10,r9
6c: 80 e4 00 08 lwz r7,8(r4)
70: 7d 08 48 10 subfc r8,r8,r9
74: 7d 4a 51 10 subfe r10,r10,r10
78: 7d 29 3a 14 add r9,r9,r7
7c: 7d 2a 48 50 subf r9,r10,r9
80: 81 04 00 0c lwz r8,12(r4)
84: 7c e7 48 10 subfc r7,r7,r9
88: 7d 4a 51 10 subfe r10,r10,r10
8c: 7d 29 42 14 add r9,r9,r8
90: 7d 2a 48 50 subf r9,r10,r9
94: 7d 08 48 10 subfc r8,r8,r9
98: 7d 4a 51 10 subfe r10,r10,r10
9c: 7d 29 2a 14 add r9,r9,r5
a0: 7d 2a 48 50 subf r9,r10,r9
a4: 7c a5 48 10 subfc r5,r5,r9
a8: 7c 63 19 10 subfe r3,r3,r3
ac: 7d 29 32 14 add r9,r9,r6
b0: 7d 23 48 50 subf r9,r3,r9
b4: 7c c6 48 10 subfc r6,r6,r9
b8: 7c 63 19 10 subfe r3,r3,r3
bc: 7c 63 48 50 subf r3,r3,r9
c0: 54 6a 80 3e rotlwi r10,r3,16
c4: 7c 63 52 14 add r3,r3,r10
c8: 7c 63 18 f8 not r3,r3
cc: 54 63 84 3e rlwinm r3,r3,16,16,31
d0: 4e 80 00 20 blr
0000000000000000 <.csum_ipv6_magic>: (PPC64)
0: 81 23 00 00 lwz r9,0(r3)
4: 80 03 00 04 lwz r0,4(r3)
8: 81 63 00 08 lwz r11,8(r3)
c: 7c e7 4a 14 add r7,r7,r9
10: 7f 89 38 40 cmplw cr7,r9,r7
14: 7d 47 02 14 add r10,r7,r0
18: 7d 30 10 26 mfocrf r9,1
1c: 55 29 f7 fe rlwinm r9,r9,30,31,31
20: 7d 4a 4a 14 add r10,r10,r9
24: 7f 80 50 40 cmplw cr7,r0,r10
28: 7d 2a 5a 14 add r9,r10,r11
2c: 80 03 00 0c lwz r0,12(r3)
30: 81 44 00 00 lwz r10,0(r4)
34: 7d 10 10 26 mfocrf r8,1
38: 55 08 f7 fe rlwinm r8,r8,30,31,31
3c: 7d 29 42 14 add r9,r9,r8
40: 81 04 00 04 lwz r8,4(r4)
44: 7f 8b 48 40 cmplw cr7,r11,r9
48: 7d 29 02 14 add r9,r9,r0
4c: 7d 70 10 26 mfocrf r11,1
50: 55 6b f7 fe rlwinm r11,r11,30,31,31
54: 7d 29 5a 14 add r9,r9,r11
58: 7f 80 48 40 cmplw cr7,r0,r9
5c: 7d 29 52 14 add r9,r9,r10
60: 7c 10 10 26 mfocrf r0,1
64: 54 00 f7 fe rlwinm r0,r0,30,31,31
68: 7d 69 02 14 add r11,r9,r0
6c: 7f 8a 58 40 cmplw cr7,r10,r11
70: 7c 0b 42 14 add r0,r11,r8
74: 81 44 00 08 lwz r10,8(r4)
78: 7c f0 10 26 mfocrf r7,1
7c: 54 e7 f7 fe rlwinm r7,r7,30,31,31
80: 7c 00 3a 14 add r0,r0,r7
84: 7f 88 00 40 cmplw cr7,r8,r0
88: 7d 20 52 14 add r9,r0,r10
8c: 80 04 00 0c lwz r0,12(r4)
90: 7d 70 10 26 mfocrf r11,1
94: 55 6b f7 fe rlwinm r11,r11,30,31,31
98: 7d 29 5a 14 add r9,r9,r11
9c: 7f 8a 48 40 cmplw cr7,r10,r9
a0: 7d 29 02 14 add r9,r9,r0
a4: 7d 70 10 26 mfocrf r11,1
a8: 55 6b f7 fe rlwinm r11,r11,30,31,31
ac: 7d 29 5a 14 add r9,r9,r11
b0: 7f 80 48 40 cmplw cr7,r0,r9
b4: 7d 29 2a 14 add r9,r9,r5
b8: 7c 10 10 26 mfocrf r0,1
bc: 54 00 f7 fe rlwinm r0,r0,30,31,31
c0: 7d 29 02 14 add r9,r9,r0
c4: 7f 85 48 40 cmplw cr7,r5,r9
c8: 7c 09 32 14 add r0,r9,r6
cc: 7d 50 10 26 mfocrf r10,1
d0: 55 4a f7 fe rlwinm r10,r10,30,31,31
d4: 7c 00 52 14 add r0,r0,r10
d8: 7f 80 30 40 cmplw cr7,r0,r6
dc: 7d 30 10 26 mfocrf r9,1
e0: 55 29 ef fe rlwinm r9,r9,29,31,31
e4: 7c 09 02 14 add r0,r9,r0
e8: 54 03 80 3e rotlwi r3,r0,16
ec: 7c 03 02 14 add r0,r3,r0
f0: 7c 03 00 f8 not r3,r0
f4: 78 63 84 22 rldicl r3,r3,48,48
f8: 4e 80 00 20 blr
This patch implements it in assembly for both PPC32 and PPC64
Link: https://github.com/linuxppc/linux/issues/9
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-05-24 11:33:18 +00:00
|
|
|
#include <linux/in6.h>
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
2005-10-20 16:44:46 +00:00
|
|
|
* Computes the checksum of a memory block at src, length len,
|
|
|
|
* and adds in "sum" (32-bit), while copying the block to dst.
|
|
|
|
* If an access exception occurs on src or dst, it stores -EFAULT
|
|
|
|
* to *src_err or *dst_err respectively (if that pointer is not
|
|
|
|
* NULL), and, for an error on src, zeroes the rest of dst.
|
|
|
|
*
|
|
|
|
* Like csum_partial, this must be called with even lengths,
|
|
|
|
* except for the last fragment.
|
2005-04-16 22:20:36 +00:00
|
|
|
*/
|
2020-07-20 14:09:24 +00:00
|
|
|
extern __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
|
2010-08-02 20:09:52 +00:00
|
|
|
|
|
|
|
#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER
|
|
|
|
extern __wsum csum_and_copy_from_user(const void __user *src, void *dst,
|
2020-07-11 04:27:49 +00:00
|
|
|
int len);
|
2010-08-02 20:11:36 +00:00
|
|
|
#define HAVE_CSUM_COPY_USER
|
|
|
|
extern __wsum csum_and_copy_to_user(const void *src, void __user *dst,
|
2020-07-11 04:27:49 +00:00
|
|
|
int len);
|
2005-10-20 16:44:46 +00:00
|
|
|
|
unify generic instances of csum_partial_copy_nocheck()
quite a few architectures have the same csum_partial_copy_nocheck() -
simply memcpy() the data and then return the csum of the copy.
hexagon, parisc, ia64, s390, um: explicitly spelled out that way.
arc, arm64, csky, h8300, m68k/nommu, microblaze, mips/GENERIC_CSUM, nds32,
nios2, openrisc, riscv, unicore32: end up picking the same thing spelled
out in lib/checksum.h (with varying amounts of perversions along the way).
everybody else (alpha, arm, c6x, m68k/mmu, mips/!GENERIC_CSUM, powerpc,
sh, sparc, x86, xtensa) have non-generic variants. For all except c6x
the declaration is in their asm/checksum.h. c6x uses the wrapper
from asm-generic/checksum.h that would normally lead to the lib/checksum.h
instance, but in case of c6x we end up using an asm function from arch/c6x
instead.
Screw that mess - have architectures with private instances define
_HAVE_ARCH_CSUM_AND_COPY in their asm/checksum.h and have the default
one right in net/checksum.h conditional on _HAVE_ARCH_CSUM_AND_COPY
*not* defined.
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2020-07-23 02:14:36 +00:00
|
|
|
#define _HAVE_ARCH_CSUM_AND_COPY
|
2020-07-11 04:12:07 +00:00
|
|
|
#define csum_partial_copy_nocheck(src, dst, len) \
|
2020-07-20 14:09:24 +00:00
|
|
|
csum_partial_copy_generic((src), (dst), (len))
|
2005-10-20 16:44:46 +00:00
|
|
|
|
|
|
|
|
2005-04-16 22:20:36 +00:00
|
|
|
/*
|
|
|
|
* turns a 32-bit partial checksum (e.g. from csum_partial) into a
|
|
|
|
* 1's complement 16-bit checksum.
|
|
|
|
*/
|
2006-11-15 05:21:58 +00:00
|
|
|
static inline __sum16 csum_fold(__wsum sum)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2022-03-09 07:56:14 +00:00
|
|
|
u32 tmp = (__force u32)sum;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* swap the two 16-bit halves of sum
|
|
|
|
* if there is a carry from adding the two 16-bit halves,
|
|
|
|
* it will carry from the lower half into the upper half,
|
|
|
|
* giving us the correct sum in the upper half.
|
|
|
|
*/
|
|
|
|
return (__force __sum16)(~(tmp + rol32(tmp, 16)) >> 16);
|
2005-04-16 22:20:36 +00:00
|
|
|
}
|
|
|
|
|
2016-11-03 05:10:55 +00:00
|
|
|
static inline u32 from64to32(u64 x)
|
|
|
|
{
|
powerpc/64: optimises from64to32()
The current implementation of from64to32() gives a poor result:
0000000000000270 <.from64to32>:
270: 38 00 ff ff li r0,-1
274: 78 69 00 22 rldicl r9,r3,32,32
278: 78 00 00 20 clrldi r0,r0,32
27c: 7c 60 00 38 and r0,r3,r0
280: 7c 09 02 14 add r0,r9,r0
284: 78 09 00 22 rldicl r9,r0,32,32
288: 7c 00 4a 14 add r0,r0,r9
28c: 78 03 00 20 clrldi r3,r0,32
290: 4e 80 00 20 blr
This patch modifies from64to32() to operate in the same
spirit as csum_fold()
It swaps the two 32-bit halves of sum then it adds it with the
unswapped sum. If there is a carry from adding the two 32-bit halves,
it will carry from the lower half into the upper half, giving us the
correct sum in the upper half.
The resulting code is:
0000000000000260 <.from64to32>:
260: 78 60 00 02 rotldi r0,r3,32
264: 7c 60 1a 14 add r3,r0,r3
268: 78 63 00 22 rldicl r3,r3,32,32
26c: 4e 80 00 20 blr
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-04-10 06:34:35 +00:00
|
|
|
return (x + ror64(x, 32)) >> 32;
|
2016-11-03 05:10:55 +00:00
|
|
|
}
|
|
|
|
|
2016-10-27 14:30:06 +00:00
|
|
|
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
|
|
|
|
__u8 proto, __wsum sum)
|
2005-04-16 22:20:36 +00:00
|
|
|
{
|
2006-11-15 05:21:58 +00:00
|
|
|
#ifdef __powerpc64__
|
2016-11-03 05:10:55 +00:00
|
|
|
u64 s = (__force u32)sum;
|
2005-04-16 22:20:36 +00:00
|
|
|
|
2006-11-15 05:21:58 +00:00
|
|
|
s += (__force u32)saddr;
|
|
|
|
s += (__force u32)daddr;
|
2016-11-03 05:15:42 +00:00
|
|
|
#ifdef __BIG_ENDIAN__
|
2006-11-15 05:21:58 +00:00
|
|
|
s += proto + len;
|
2016-11-03 05:15:42 +00:00
|
|
|
#else
|
|
|
|
s += (proto + len) << 8;
|
|
|
|
#endif
|
2016-11-03 05:10:55 +00:00
|
|
|
return (__force __wsum) from64to32(s);
|
2005-10-20 16:44:46 +00:00
|
|
|
#else
|
|
|
|
__asm__("\n\
|
|
|
|
addc %0,%0,%1 \n\
|
|
|
|
adde %0,%0,%2 \n\
|
|
|
|
adde %0,%0,%3 \n\
|
|
|
|
addze %0,%0 \n\
|
|
|
|
"
|
|
|
|
: "=r" (sum)
|
2006-11-15 05:21:58 +00:00
|
|
|
: "r" (daddr), "r"(saddr), "r"(proto + len), "0"(sum));
|
|
|
|
return sum;
|
2005-04-16 22:20:36 +00:00
|
|
|
#endif
|
2006-11-15 05:21:58 +00:00
|
|
|
}
|
2013-09-23 02:04:51 +00:00
|
|
|
|
2015-05-19 15:18:55 +00:00
|
|
|
/*
|
|
|
|
* computes the checksum of the TCP/UDP pseudo-header
|
|
|
|
* returns a 16-bit checksum, already complemented
|
|
|
|
*/
|
2016-10-27 14:30:06 +00:00
|
|
|
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len,
|
|
|
|
__u8 proto, __wsum sum)
|
2015-05-19 15:18:55 +00:00
|
|
|
{
|
|
|
|
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
|
|
|
|
}
|
|
|
|
|
powerpc: add support for csum_add()
The C version of csum_add() as defined in include/net/checksum.h gives
the following assembly in ppc32:
0: 7c 04 1a 14 add r0,r4,r3
4: 7c 64 00 10 subfc r3,r4,r0
8: 7c 63 19 10 subfe r3,r3,r3
c: 7c 63 00 50 subf r3,r3,r0
and the following in ppc64:
0xc000000000001af8 <+0>: add r3,r3,r4
0xc000000000001afc <+4>: cmplw cr7,r3,r4
0xc000000000001b00 <+8>: mfcr r4
0xc000000000001b04 <+12>: rlwinm r4,r4,29,31,31
0xc000000000001b08 <+16>: add r3,r4,r3
0xc000000000001b0c <+20>: clrldi r3,r3,32
0xc000000000001b10 <+24>: blr
include/net/checksum.h also offers the possibility to define an arch
specific function. This patch provides a specific csum_add() inline
function.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2015-05-19 15:18:57 +00:00
|
|
|
#define HAVE_ARCH_CSUM_ADD
|
powerpc: Force inlining of csum_add()
Commit 328e7e487a46 ("powerpc: force inlining of csum_partial() to
avoid multiple csum_partial() with GCC10") inlined csum_partial().
Now that csum_partial() is inlined, GCC outlines csum_add() when
called by csum_partial().
c064fb28 <csum_add>:
c064fb28: 7c 63 20 14 addc r3,r3,r4
c064fb2c: 7c 63 01 94 addze r3,r3
c064fb30: 4e 80 00 20 blr
c0665fb8 <csum_add>:
c0665fb8: 7c 63 20 14 addc r3,r3,r4
c0665fbc: 7c 63 01 94 addze r3,r3
c0665fc0: 4e 80 00 20 blr
c066719c: 7c 9a c0 2e lwzx r4,r26,r24
c06671a0: 38 60 00 00 li r3,0
c06671a4: 7f 1a c2 14 add r24,r26,r24
c06671a8: 4b ff ee 11 bl c0665fb8 <csum_add>
c06671ac: 80 98 00 04 lwz r4,4(r24)
c06671b0: 4b ff ee 09 bl c0665fb8 <csum_add>
c06671b4: 80 98 00 08 lwz r4,8(r24)
c06671b8: 4b ff ee 01 bl c0665fb8 <csum_add>
c06671bc: a0 98 00 0c lhz r4,12(r24)
c06671c0: 4b ff ed f9 bl c0665fb8 <csum_add>
c06671c4: 7c 63 18 f8 not r3,r3
c06671c8: 81 3f 00 68 lwz r9,104(r31)
c06671cc: 81 5f 00 a0 lwz r10,160(r31)
c06671d0: 7d 29 18 14 addc r9,r9,r3
c06671d4: 7d 29 01 94 addze r9,r9
c06671d8: 91 3f 00 68 stw r9,104(r31)
c06671dc: 7d 1a 50 50 subf r8,r26,r10
c06671e0: 83 01 00 10 lwz r24,16(r1)
c06671e4: 83 41 00 18 lwz r26,24(r1)
The sum with 0 is useless, should have been skipped.
And there is even one completely unused instance of csum_add().
In file included from ./include/net/checksum.h:22,
from ./include/linux/skbuff.h:28,
from ./include/linux/icmp.h:16,
from net/ipv6/ip6_tunnel.c:23:
./arch/powerpc/include/asm/checksum.h: In function '__ip6_tnl_rcv':
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
| ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:172:31: note: called from here
172 | sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
| ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:177:31: note: called from here
177 | sum = csum_add(sum, (__force __wsum)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
178 | *(const u32 *)(buff + 4));
| ~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
| ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:183:31: note: called from here
183 | sum = csum_add(sum, (__force __wsum)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
184 | *(const u32 *)(buff + 8));
| ~~~~~~~~~~~~~~~~~~~~~~~~~
./arch/powerpc/include/asm/checksum.h:94:22: warning: inlining failed in call to 'csum_add': call is unlikely and code size would grow [-Winline]
94 | static inline __wsum csum_add(__wsum csum, __wsum addend)
| ^~~~~~~~
./arch/powerpc/include/asm/checksum.h:186:31: note: called from here
186 | sum = csum_add(sum, (__force __wsum)
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
187 | *(const u16 *)(buff + 12));
| ~~~~~~~~~~~~~~~~~~~~~~~~~~
Force inlining of csum_add().
94c: 80 df 00 a0 lwz r6,160(r31)
950: 7d 28 50 2e lwzx r9,r8,r10
954: 7d 48 52 14 add r10,r8,r10
958: 80 aa 00 04 lwz r5,4(r10)
95c: 80 ff 00 68 lwz r7,104(r31)
960: 7d 29 28 14 addc r9,r9,r5
964: 7d 29 01 94 addze r9,r9
968: 7d 08 30 50 subf r8,r8,r6
96c: 80 aa 00 08 lwz r5,8(r10)
970: a1 4a 00 0c lhz r10,12(r10)
974: 7d 29 28 14 addc r9,r9,r5
978: 7d 29 01 94 addze r9,r9
97c: 7d 29 50 14 addc r9,r9,r10
980: 7d 29 01 94 addze r9,r9
984: 7d 29 48 f8 not r9,r9
988: 7c e7 48 14 addc r7,r7,r9
98c: 7c e7 01 94 addze r7,r7
990: 90 ff 00 68 stw r7,104(r31)
In the non-inlined version, the first sum with 0 was performed.
Here it is skipped.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/f7f4d4e364de6e473da874468b903da6e5d97adc.1620713272.git.christophe.leroy@csgroup.eu
2021-05-11 06:08:06 +00:00
|
|
|
static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
|
powerpc: add support for csum_add()
The C version of csum_add() as defined in include/net/checksum.h gives
the following assembly in ppc32:
0: 7c 04 1a 14 add r0,r4,r3
4: 7c 64 00 10 subfc r3,r4,r0
8: 7c 63 19 10 subfe r3,r3,r3
c: 7c 63 00 50 subf r3,r3,r0
and the following in ppc64:
0xc000000000001af8 <+0>: add r3,r3,r4
0xc000000000001afc <+4>: cmplw cr7,r3,r4
0xc000000000001b00 <+8>: mfcr r4
0xc000000000001b04 <+12>: rlwinm r4,r4,29,31,31
0xc000000000001b08 <+16>: add r3,r4,r3
0xc000000000001b0c <+20>: clrldi r3,r3,32
0xc000000000001b10 <+24>: blr
include/net/checksum.h also offers the possibility to define an arch
specific function. This patch provides a specific csum_add() inline
function.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2015-05-19 15:18:57 +00:00
|
|
|
{
|
|
|
|
#ifdef __powerpc64__
|
|
|
|
u64 res = (__force u64)csum;
|
2022-02-12 07:36:17 +00:00
|
|
|
|
|
|
|
res += (__force u64)addend;
|
|
|
|
return (__force __wsum)((u32)res + (res >> 32));
|
|
|
|
#else
|
2015-09-22 14:34:34 +00:00
|
|
|
if (__builtin_constant_p(csum) && csum == 0)
|
|
|
|
return addend;
|
|
|
|
if (__builtin_constant_p(addend) && addend == 0)
|
|
|
|
return csum;
|
powerpc: add support for csum_add()
The C version of csum_add() as defined in include/net/checksum.h gives
the following assembly in ppc32:
0: 7c 04 1a 14 add r0,r4,r3
4: 7c 64 00 10 subfc r3,r4,r0
8: 7c 63 19 10 subfe r3,r3,r3
c: 7c 63 00 50 subf r3,r3,r0
and the following in ppc64:
0xc000000000001af8 <+0>: add r3,r3,r4
0xc000000000001afc <+4>: cmplw cr7,r3,r4
0xc000000000001b00 <+8>: mfcr r4
0xc000000000001b04 <+12>: rlwinm r4,r4,29,31,31
0xc000000000001b08 <+16>: add r3,r4,r3
0xc000000000001b0c <+20>: clrldi r3,r3,32
0xc000000000001b10 <+24>: blr
include/net/checksum.h also offers the possibility to define an arch
specific function. This patch provides a specific csum_add() inline
function.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2015-05-19 15:18:57 +00:00
|
|
|
|
|
|
|
asm("addc %0,%0,%1;"
|
|
|
|
"addze %0,%0;"
|
2015-09-22 14:34:21 +00:00
|
|
|
: "+r" (csum) : "r" (addend) : "xer");
|
powerpc: add support for csum_add()
The C version of csum_add() as defined in include/net/checksum.h gives
the following assembly in ppc32:
0: 7c 04 1a 14 add r0,r4,r3
4: 7c 64 00 10 subfc r3,r4,r0
8: 7c 63 19 10 subfe r3,r3,r3
c: 7c 63 00 50 subf r3,r3,r0
and the following in ppc64:
0xc000000000001af8 <+0>: add r3,r3,r4
0xc000000000001afc <+4>: cmplw cr7,r3,r4
0xc000000000001b00 <+8>: mfcr r4
0xc000000000001b04 <+12>: rlwinm r4,r4,29,31,31
0xc000000000001b08 <+16>: add r3,r4,r3
0xc000000000001b0c <+20>: clrldi r3,r3,32
0xc000000000001b10 <+24>: blr
include/net/checksum.h also offers the possibility to define an arch
specific function. This patch provides a specific csum_add() inline
function.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Scott Wood <scottwood@freescale.com>
2015-05-19 15:18:57 +00:00
|
|
|
return csum;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
powerpc/net: Implement powerpc specific csum_shift() to remove branch
Today's implementation of csum_shift() leads to branching based on
parity of 'offset'
000002f8 <csum_block_add>:
2f8: 70 a5 00 01 andi. r5,r5,1
2fc: 41 a2 00 08 beq 304 <csum_block_add+0xc>
300: 54 84 c0 3e rotlwi r4,r4,24
304: 7c 63 20 14 addc r3,r3,r4
308: 7c 63 01 94 addze r3,r3
30c: 4e 80 00 20 blr
Use first bit of 'offset' directly as input of the rotation instead of
branching.
000002f8 <csum_block_add>:
2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28
2fc: 20 a5 00 20 subfic r5,r5,32
300: 5c 84 28 3e rotlw r4,r4,r5
304: 7c 63 20 14 addc r3,r3,r4
308: 7c 63 01 94 addze r3,r3
30c: 4e 80 00 20 blr
And change to left shift instead of right shift to skip one more
instruction. This has no impact on the final sum.
000002f8 <csum_block_add>:
2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28
2fc: 5c 84 28 3e rotlw r4,r4,r5
300: 7c 63 20 14 addc r3,r3,r4
304: 7c 63 01 94 addze r3,r3
308: 4e 80 00 20 blr
Seems like only powerpc benefits from a branchless implementation.
Other main architectures like ARM or X86 get better code with
the generic implementation and its branch.
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-08 16:12:10 +00:00
|
|
|
#define HAVE_ARCH_CSUM_SHIFT
|
|
|
|
static __always_inline __wsum csum_shift(__wsum sum, int offset)
|
|
|
|
{
|
|
|
|
/* rotate sum to align it with a 16b boundary */
|
|
|
|
return (__force __wsum)rol32((__force u32)sum, (offset & 1) << 3);
|
|
|
|
}
|
|
|
|
|
2015-09-22 14:34:25 +00:00
|
|
|
/*
|
|
|
|
* This is a version of ip_compute_csum() optimized for IP headers,
|
|
|
|
* which always checksum on 4 octet boundaries. ihl is the number
|
|
|
|
* of 32-bit words and is always >= 5.
|
|
|
|
*/
|
|
|
|
static inline __wsum ip_fast_csum_nofold(const void *iph, unsigned int ihl)
|
|
|
|
{
|
|
|
|
const u32 *ptr = (const u32 *)iph + 1;
|
|
|
|
#ifdef __powerpc64__
|
|
|
|
unsigned int i;
|
|
|
|
u64 s = *(const u32 *)iph;
|
|
|
|
|
|
|
|
for (i = 0; i < ihl - 1; i++, ptr++)
|
|
|
|
s += *ptr;
|
2016-11-03 05:10:55 +00:00
|
|
|
return (__force __wsum)from64to32(s);
|
2015-09-22 14:34:25 +00:00
|
|
|
#else
|
|
|
|
__wsum sum, tmp;
|
|
|
|
|
|
|
|
asm("mtctr %3;"
|
|
|
|
"addc %0,%4,%5;"
|
|
|
|
"1: lwzu %1, 4(%2);"
|
|
|
|
"adde %0,%0,%1;"
|
|
|
|
"bdnz 1b;"
|
|
|
|
"addze %0,%0;"
|
|
|
|
: "=r" (sum), "=r" (tmp), "+b" (ptr)
|
|
|
|
: "r" (ihl - 2), "r" (*(const u32 *)iph), "r" (*ptr)
|
|
|
|
: "ctr", "xer", "memory");
|
|
|
|
|
|
|
|
return sum;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
|
|
|
|
{
|
|
|
|
return csum_fold(ip_fast_csum_nofold(iph, ihl));
|
|
|
|
}
|
|
|
|
|
2016-03-07 17:44:37 +00:00
|
|
|
/*
|
|
|
|
* computes the checksum of a memory block at buff, length len,
|
|
|
|
* and adds in "sum" (32-bit)
|
|
|
|
*
|
|
|
|
* returns a 32-bit number suitable for feeding into itself
|
|
|
|
* or csum_tcpudp_magic
|
|
|
|
*
|
|
|
|
* this function must be called with even lengths, except
|
|
|
|
* for the last fragment, which may be odd
|
|
|
|
*
|
|
|
|
* it's best to have buff aligned on a 32-bit boundary
|
|
|
|
*/
|
|
|
|
__wsum __csum_partial(const void *buff, int len, __wsum sum);
|
|
|
|
|
2020-10-15 10:52:20 +00:00
|
|
|
static __always_inline __wsum csum_partial(const void *buff, int len, __wsum sum)
|
2016-03-07 17:44:37 +00:00
|
|
|
{
|
|
|
|
if (__builtin_constant_p(len) && len <= 16 && (len & 1) == 0) {
|
|
|
|
if (len == 2)
|
|
|
|
sum = csum_add(sum, (__force __wsum)*(const u16 *)buff);
|
|
|
|
if (len >= 4)
|
|
|
|
sum = csum_add(sum, (__force __wsum)*(const u32 *)buff);
|
|
|
|
if (len == 6)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u16 *)(buff + 4));
|
|
|
|
if (len >= 8)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u32 *)(buff + 4));
|
|
|
|
if (len == 10)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u16 *)(buff + 8));
|
|
|
|
if (len >= 12)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u32 *)(buff + 8));
|
|
|
|
if (len == 14)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u16 *)(buff + 12));
|
|
|
|
if (len >= 16)
|
|
|
|
sum = csum_add(sum, (__force __wsum)
|
|
|
|
*(const u32 *)(buff + 12));
|
|
|
|
} else if (__builtin_constant_p(len) && (len & 3) == 0) {
|
|
|
|
sum = csum_add(sum, ip_fast_csum_nofold(buff, len >> 2));
|
|
|
|
} else {
|
|
|
|
sum = __csum_partial(buff, len, sum);
|
|
|
|
}
|
|
|
|
return sum;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* this routine is used for miscellaneous IP-like checksums, mainly
|
|
|
|
* in icmp.c
|
|
|
|
*/
|
|
|
|
static inline __sum16 ip_compute_csum(const void *buff, int len)
|
|
|
|
{
|
|
|
|
return csum_fold(csum_partial(buff, len, 0));
|
|
|
|
}
|
|
|
|
|
powerpc: Implement csum_ipv6_magic in assembly
The generic csum_ipv6_magic() generates a pretty bad result
00000000 <csum_ipv6_magic>: (PPC32)
0: 81 23 00 00 lwz r9,0(r3)
4: 81 03 00 04 lwz r8,4(r3)
8: 7c e7 4a 14 add r7,r7,r9
c: 7d 29 38 10 subfc r9,r9,r7
10: 7d 4a 51 10 subfe r10,r10,r10
14: 7d 27 42 14 add r9,r7,r8
18: 7d 2a 48 50 subf r9,r10,r9
1c: 80 e3 00 08 lwz r7,8(r3)
20: 7d 08 48 10 subfc r8,r8,r9
24: 7d 4a 51 10 subfe r10,r10,r10
28: 7d 29 3a 14 add r9,r9,r7
2c: 81 03 00 0c lwz r8,12(r3)
30: 7d 2a 48 50 subf r9,r10,r9
34: 7c e7 48 10 subfc r7,r7,r9
38: 7d 4a 51 10 subfe r10,r10,r10
3c: 7d 29 42 14 add r9,r9,r8
40: 7d 2a 48 50 subf r9,r10,r9
44: 80 e4 00 00 lwz r7,0(r4)
48: 7d 08 48 10 subfc r8,r8,r9
4c: 7d 4a 51 10 subfe r10,r10,r10
50: 7d 29 3a 14 add r9,r9,r7
54: 7d 2a 48 50 subf r9,r10,r9
58: 81 04 00 04 lwz r8,4(r4)
5c: 7c e7 48 10 subfc r7,r7,r9
60: 7d 4a 51 10 subfe r10,r10,r10
64: 7d 29 42 14 add r9,r9,r8
68: 7d 2a 48 50 subf r9,r10,r9
6c: 80 e4 00 08 lwz r7,8(r4)
70: 7d 08 48 10 subfc r8,r8,r9
74: 7d 4a 51 10 subfe r10,r10,r10
78: 7d 29 3a 14 add r9,r9,r7
7c: 7d 2a 48 50 subf r9,r10,r9
80: 81 04 00 0c lwz r8,12(r4)
84: 7c e7 48 10 subfc r7,r7,r9
88: 7d 4a 51 10 subfe r10,r10,r10
8c: 7d 29 42 14 add r9,r9,r8
90: 7d 2a 48 50 subf r9,r10,r9
94: 7d 08 48 10 subfc r8,r8,r9
98: 7d 4a 51 10 subfe r10,r10,r10
9c: 7d 29 2a 14 add r9,r9,r5
a0: 7d 2a 48 50 subf r9,r10,r9
a4: 7c a5 48 10 subfc r5,r5,r9
a8: 7c 63 19 10 subfe r3,r3,r3
ac: 7d 29 32 14 add r9,r9,r6
b0: 7d 23 48 50 subf r9,r3,r9
b4: 7c c6 48 10 subfc r6,r6,r9
b8: 7c 63 19 10 subfe r3,r3,r3
bc: 7c 63 48 50 subf r3,r3,r9
c0: 54 6a 80 3e rotlwi r10,r3,16
c4: 7c 63 52 14 add r3,r3,r10
c8: 7c 63 18 f8 not r3,r3
cc: 54 63 84 3e rlwinm r3,r3,16,16,31
d0: 4e 80 00 20 blr
0000000000000000 <.csum_ipv6_magic>: (PPC64)
0: 81 23 00 00 lwz r9,0(r3)
4: 80 03 00 04 lwz r0,4(r3)
8: 81 63 00 08 lwz r11,8(r3)
c: 7c e7 4a 14 add r7,r7,r9
10: 7f 89 38 40 cmplw cr7,r9,r7
14: 7d 47 02 14 add r10,r7,r0
18: 7d 30 10 26 mfocrf r9,1
1c: 55 29 f7 fe rlwinm r9,r9,30,31,31
20: 7d 4a 4a 14 add r10,r10,r9
24: 7f 80 50 40 cmplw cr7,r0,r10
28: 7d 2a 5a 14 add r9,r10,r11
2c: 80 03 00 0c lwz r0,12(r3)
30: 81 44 00 00 lwz r10,0(r4)
34: 7d 10 10 26 mfocrf r8,1
38: 55 08 f7 fe rlwinm r8,r8,30,31,31
3c: 7d 29 42 14 add r9,r9,r8
40: 81 04 00 04 lwz r8,4(r4)
44: 7f 8b 48 40 cmplw cr7,r11,r9
48: 7d 29 02 14 add r9,r9,r0
4c: 7d 70 10 26 mfocrf r11,1
50: 55 6b f7 fe rlwinm r11,r11,30,31,31
54: 7d 29 5a 14 add r9,r9,r11
58: 7f 80 48 40 cmplw cr7,r0,r9
5c: 7d 29 52 14 add r9,r9,r10
60: 7c 10 10 26 mfocrf r0,1
64: 54 00 f7 fe rlwinm r0,r0,30,31,31
68: 7d 69 02 14 add r11,r9,r0
6c: 7f 8a 58 40 cmplw cr7,r10,r11
70: 7c 0b 42 14 add r0,r11,r8
74: 81 44 00 08 lwz r10,8(r4)
78: 7c f0 10 26 mfocrf r7,1
7c: 54 e7 f7 fe rlwinm r7,r7,30,31,31
80: 7c 00 3a 14 add r0,r0,r7
84: 7f 88 00 40 cmplw cr7,r8,r0
88: 7d 20 52 14 add r9,r0,r10
8c: 80 04 00 0c lwz r0,12(r4)
90: 7d 70 10 26 mfocrf r11,1
94: 55 6b f7 fe rlwinm r11,r11,30,31,31
98: 7d 29 5a 14 add r9,r9,r11
9c: 7f 8a 48 40 cmplw cr7,r10,r9
a0: 7d 29 02 14 add r9,r9,r0
a4: 7d 70 10 26 mfocrf r11,1
a8: 55 6b f7 fe rlwinm r11,r11,30,31,31
ac: 7d 29 5a 14 add r9,r9,r11
b0: 7f 80 48 40 cmplw cr7,r0,r9
b4: 7d 29 2a 14 add r9,r9,r5
b8: 7c 10 10 26 mfocrf r0,1
bc: 54 00 f7 fe rlwinm r0,r0,30,31,31
c0: 7d 29 02 14 add r9,r9,r0
c4: 7f 85 48 40 cmplw cr7,r5,r9
c8: 7c 09 32 14 add r0,r9,r6
cc: 7d 50 10 26 mfocrf r10,1
d0: 55 4a f7 fe rlwinm r10,r10,30,31,31
d4: 7c 00 52 14 add r0,r0,r10
d8: 7f 80 30 40 cmplw cr7,r0,r6
dc: 7d 30 10 26 mfocrf r9,1
e0: 55 29 ef fe rlwinm r9,r9,29,31,31
e4: 7c 09 02 14 add r0,r9,r0
e8: 54 03 80 3e rotlwi r3,r0,16
ec: 7c 03 02 14 add r0,r3,r0
f0: 7c 03 00 f8 not r3,r0
f4: 78 63 84 22 rldicl r3,r3,48,48
f8: 4e 80 00 20 blr
This patch implements it in assembly for both PPC32 and PPC64
Link: https://github.com/linuxppc/linux/issues/9
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Reviewed-by: Segher Boessenkool <segher@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
2018-05-24 11:33:18 +00:00
|
|
|
#define _HAVE_ARCH_IPV6_CSUM
|
|
|
|
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
|
|
|
|
const struct in6_addr *daddr,
|
|
|
|
__u32 len, __u8 proto, __wsum sum);
|
|
|
|
|
2005-12-16 21:43:46 +00:00
|
|
|
#endif /* __KERNEL__ */
|
2005-10-20 16:44:46 +00:00
|
|
|
#endif
|