nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingress device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-only
|
|
|
|
|
|
|
|
/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
|
|
|
|
*
|
|
|
|
* Copyright (c) 2019-2020 Red Hat GmbH
|
|
|
|
*
|
|
|
|
* Author: Stefano Brivio <sbrivio@redhat.com>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/netlink.h>
|
|
|
|
#include <linux/netfilter.h>
|
|
|
|
#include <linux/netfilter/nf_tables.h>
|
|
|
|
#include <net/netfilter/nf_tables_core.h>
|
|
|
|
#include <uapi/linux/netfilter/nf_tables.h>
|
|
|
|
#include <linux/bitmap.h>
|
|
|
|
#include <linux/bitops.h>
|
|
|
|
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <asm/fpu/api.h>
|
|
|
|
|
|
|
|
#include "nft_set_pipapo_avx2.h"
|
|
|
|
#include "nft_set_pipapo.h"
|
|
|
|
|
|
|
|
#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
|
|
|
|
|
|
|
|
/* Load from memory into YMM register with non-temporal hint ("stream load"),
|
|
|
|
* that is, don't fetch lines from memory into the cache. This avoids pushing
|
|
|
|
* precious packet data out of the cache hierarchy, and is appropriate when:
|
|
|
|
*
|
|
|
|
* - loading buckets from lookup tables, as they are not going to be used
|
|
|
|
* again before packets are entirely classified
|
|
|
|
*
|
|
|
|
* - loading the result bitmap from the previous field, as it's never used
|
|
|
|
* again
|
|
|
|
*/
|
|
|
|
#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
|
|
|
|
asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
|
|
|
|
|
|
|
|
/* Stream a single lookup table bucket into YMM register given lookup table,
|
|
|
|
* group index, value of packet bits, bucket size.
|
|
|
|
*/
|
|
|
|
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(reg, \
|
|
|
|
lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
|
|
|
|
(v)) * (bsize)])
|
|
|
|
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(reg, \
|
|
|
|
lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
|
|
|
|
(v)) * (bsize)])
|
|
|
|
|
|
|
|
/* Bitwise AND: the staple operation of this algorithm */
|
|
|
|
#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
|
|
|
|
asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
|
|
|
|
|
|
|
|
/* Jump to label if @reg is zero */
|
|
|
|
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
|
work around gcc bugs with 'asm goto' with outputs
commit 4356e9f841f7fbb945521cef3577ba394c65f3fc upstream.
We've had issues with gcc and 'asm goto' before, and we created a
'asm_volatile_goto()' macro for that in the past: see commits
3f0116c3238a ("compiler/gcc4: Add quirk for 'asm goto' miscompilation
bug") and a9f180345f53 ("compiler/gcc4: Make quirk for
asm_volatile_goto() unconditional").
Then, much later, we ended up removing the workaround in commit
43c249ea0b1e ("compiler-gcc.h: remove ancient workaround for gcc PR
58670") because we no longer supported building the kernel with the
affected gcc versions, but we left the macro uses around.
Now, Sean Christopherson reports a new version of a very similar
problem, which is fixed by re-applying that ancient workaround. But the
problem in question is limited to only the 'asm goto with outputs'
cases, so instead of re-introducing the old workaround as-is, let's
rename and limit the workaround to just that much less common case.
It looks like there are at least two separate issues that all hit in
this area:
(a) some versions of gcc don't mark the asm goto as 'volatile' when it
has outputs:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98619
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110420
which is easy to work around by just adding the 'volatile' by hand.
(b) Internal compiler errors:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110422
which are worked around by adding the extra empty 'asm' as a
barrier, as in the original workaround.
but the problem Sean sees may be a third thing since it involves bad
code generation (not an ICE) even with the manually added 'volatile'.
but the same old workaround works for this case, even if this feels a
bit like voodoo programming and may only be hiding the issue.
Reported-and-tested-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/all/20240208220604.140859-1-seanjc@google.com/
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Uros Bizjak <ubizjak@gmail.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Cc: Andrew Pinski <quic_apinski@quicinc.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2024-02-09 20:39:31 +00:00
|
|
|
asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
"je %l[" #label "]" : : : : label)
|
|
|
|
|
|
|
|
/* Store 256 bits from YMM register into memory. Contrary to bucket load
|
|
|
|
* operation, we don't bypass the cache here, as stored matching results
|
|
|
|
* are always used shortly after.
|
|
|
|
*/
|
|
|
|
#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
|
|
|
|
asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
|
|
|
|
|
|
|
|
/* Zero out a complete YMM register, @reg */
|
|
|
|
#define NFT_PIPAPO_AVX2_ZERO(reg) \
|
|
|
|
asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_prepare() - Prepare before main algorithm body
|
|
|
|
*
|
|
|
|
* This zeroes out ymm15, which is later used whenever we need to clear a
|
|
|
|
* memory location, by storing its content into memory.
|
|
|
|
*/
|
|
|
|
static void nft_pipapo_avx2_prepare(void)
|
|
|
|
{
|
|
|
|
NFT_PIPAPO_AVX2_ZERO(15);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_fill() - Fill a bitmap region with ones
|
|
|
|
* @data: Base memory area
|
|
|
|
* @start: First bit to set
|
|
|
|
* @len: Count of bits to fill
|
|
|
|
*
|
|
|
|
* This is nothing else than a version of bitmap_set(), as used e.g. by
|
|
|
|
* pipapo_refill(), tailored for the microarchitectures using it and better
|
|
|
|
* suited for the specific usage: it's very likely that we'll set a small number
|
|
|
|
* of bits, not crossing a word boundary, and correct branch prediction is
|
|
|
|
* critical here.
|
|
|
|
*
|
|
|
|
* This function doesn't actually use any AVX2 instruction.
|
|
|
|
*/
|
|
|
|
static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
|
|
|
|
{
|
|
|
|
int offset = start % BITS_PER_LONG;
|
|
|
|
unsigned long mask;
|
|
|
|
|
|
|
|
data += start / BITS_PER_LONG;
|
|
|
|
|
|
|
|
if (likely(len == 1)) {
|
|
|
|
*data |= BIT(offset);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (likely(len < BITS_PER_LONG || offset)) {
|
|
|
|
if (likely(len + offset <= BITS_PER_LONG)) {
|
|
|
|
*data |= GENMASK(len - 1 + offset, offset);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
*data |= ~0UL << offset;
|
|
|
|
len -= BITS_PER_LONG - offset;
|
|
|
|
data++;
|
|
|
|
|
|
|
|
if (len <= BITS_PER_LONG) {
|
|
|
|
mask = ~0UL >> (BITS_PER_LONG - len);
|
|
|
|
*data |= mask;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memset(data, 0xff, len / BITS_PER_BYTE);
|
|
|
|
data += len / BITS_PER_LONG;
|
|
|
|
|
|
|
|
len %= BITS_PER_LONG;
|
|
|
|
if (len)
|
|
|
|
*data |= ~0UL >> (BITS_PER_LONG - len);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
|
|
|
|
* @offset: Start from given bitmap (equivalent to bucket) offset, in longs
|
|
|
|
* @map: Bitmap to be scanned for set bits
|
|
|
|
* @dst: Destination bitmap
|
|
|
|
* @mt: Mapping table containing bit set specifiers
|
|
|
|
* @last: Return index of first set bit, if this is the last field
|
|
|
|
*
|
|
|
|
* This is an alternative implementation of pipapo_refill() suitable for usage
|
|
|
|
* with AVX2 lookup routines: we know there are four words to be scanned, at
|
|
|
|
* a given offset inside the map, for each matching iteration.
|
|
|
|
*
|
|
|
|
* This function doesn't actually use any AVX2 instruction.
|
|
|
|
*
|
|
|
|
* Return: first set bit index if @last, index of first filled word otherwise.
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
|
|
|
|
unsigned long *dst,
|
|
|
|
union nft_pipapo_map_bucket *mt, bool last)
|
|
|
|
{
|
|
|
|
int ret = -1;
|
|
|
|
|
|
|
|
#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
|
|
|
|
do { \
|
|
|
|
while (map[(x)]) { \
|
|
|
|
int r = __builtin_ctzl(map[(x)]); \
|
|
|
|
int i = (offset + (x)) * BITS_PER_LONG + r; \
|
|
|
|
\
|
|
|
|
if (last) \
|
|
|
|
return i; \
|
|
|
|
\
|
|
|
|
nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
|
|
|
|
\
|
|
|
|
if (ret == -1) \
|
|
|
|
ret = mt[i].to; \
|
|
|
|
\
|
|
|
|
map[(x)] &= ~(1UL << r); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
|
|
|
|
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
|
|
|
|
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
|
|
|
|
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
|
|
|
|
#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
|
|
|
|
* @map: Previous match result, used as initial bitmap
|
|
|
|
* @fill: Destination bitmap to be filled with current match result
|
|
|
|
* @f: Field, containing lookup and mapping tables
|
|
|
|
* @offset: Ignore buckets before the given index, no bits are filled there
|
|
|
|
* @pkt: Packet data, pointer to input nftables register
|
|
|
|
* @first: If this is the first field, don't source previous result
|
|
|
|
* @last: Last field: stop at the first match and return bit index
|
|
|
|
*
|
|
|
|
* Load buckets from lookup table corresponding to the values of each 4-bit
|
|
|
|
* group of packet bytes, and perform a bitwise intersection between them. If
|
|
|
|
* this is the first field in the set, simply AND the buckets together
|
|
|
|
* (equivalent to using an all-ones starting bitmap), use the provided starting
|
|
|
|
* bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
|
|
|
|
* working bitmap, @fill.
|
|
|
|
*
|
|
|
|
* This is used for 8-bit fields (i.e. protocol numbers).
|
|
|
|
*
|
|
|
|
* Out-of-order (and superscalar) execution is vital here, so it's critical to
|
|
|
|
* avoid false data dependencies. CPU and compiler could (mostly) take care of
|
|
|
|
* this on their own, but the operation ordering is explicitly given here with
|
|
|
|
* a likely execution order in mind, to highlight possible stalls. That's why
|
|
|
|
* a number of logically distinct operations (i.e. loading buckets, intersecting
|
|
|
|
* buckets) are interleaved.
|
|
|
|
*
|
|
|
|
* Return: -1 on no match, rule index of match if @last, otherwise first long
|
|
|
|
* word index to be checked next (i.e. first filled word).
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
|
|
|
|
struct nft_pipapo_field *f, int offset,
|
|
|
|
const u8 *pkt, bool first, bool last)
|
|
|
|
{
|
|
|
|
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
|
|
|
|
u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
|
|
|
|
unsigned long *lt = f->lt, bsize = f->bsize;
|
|
|
|
|
|
|
|
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
|
|
|
|
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
|
|
|
|
if (first) {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(4, 0, 1);
|
|
|
|
} else {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
|
|
|
|
NFT_PIPAPO_AVX2_AND(3, 0, 1);
|
|
|
|
NFT_PIPAPO_AVX2_AND(4, 2, 3);
|
|
|
|
}
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
|
|
|
|
|
|
|
|
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
|
|
|
|
if (last)
|
|
|
|
return b;
|
|
|
|
|
|
|
|
if (unlikely(ret == -1))
|
|
|
|
ret = b / XSAVE_YMM_SIZE;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
nomatch:
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
|
|
|
|
nothing:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
|
|
|
|
* @map: Previous match result, used as initial bitmap
|
|
|
|
* @fill: Destination bitmap to be filled with current match result
|
|
|
|
* @f: Field, containing lookup and mapping tables
|
|
|
|
* @offset: Ignore buckets before the given index, no bits are filled there
|
|
|
|
* @pkt: Packet data, pointer to input nftables register
|
|
|
|
* @first: If this is the first field, don't source previous result
|
|
|
|
* @last: Last field: stop at the first match and return bit index
|
|
|
|
*
|
|
|
|
* See nft_pipapo_avx2_lookup_4b_2().
|
|
|
|
*
|
|
|
|
* This is used for 16-bit fields (i.e. ports).
|
|
|
|
*
|
|
|
|
* Return: -1 on no match, rule index of match if @last, otherwise first long
|
|
|
|
* word index to be checked next (i.e. first filled word).
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
|
|
|
|
struct nft_pipapo_field *f, int offset,
|
|
|
|
const u8 *pkt, bool first, bool last)
|
|
|
|
{
|
|
|
|
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
|
|
|
|
u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
|
|
|
|
unsigned long *lt = f->lt, bsize = f->bsize;
|
|
|
|
|
|
|
|
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
|
|
|
|
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
|
|
|
|
if (first) {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(4, 0, 1);
|
|
|
|
NFT_PIPAPO_AVX2_AND(5, 2, 3);
|
|
|
|
NFT_PIPAPO_AVX2_AND(7, 4, 5);
|
|
|
|
} else {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(5, 0, 1);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_AND(6, 2, 3);
|
|
|
|
NFT_PIPAPO_AVX2_AND(7, 4, 5);
|
|
|
|
/* Stall */
|
|
|
|
NFT_PIPAPO_AVX2_AND(7, 6, 7);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Stall */
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
|
|
|
|
|
|
|
|
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
|
|
|
|
if (last)
|
|
|
|
return b;
|
|
|
|
|
|
|
|
if (unlikely(ret == -1))
|
|
|
|
ret = b / XSAVE_YMM_SIZE;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
nomatch:
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
|
|
|
|
nothing:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
|
|
|
|
* @map: Previous match result, used as initial bitmap
|
|
|
|
* @fill: Destination bitmap to be filled with current match result
|
|
|
|
* @f: Field, containing lookup and mapping tables
|
|
|
|
* @offset: Ignore buckets before the given index, no bits are filled there
|
|
|
|
* @pkt: Packet data, pointer to input nftables register
|
|
|
|
* @first: If this is the first field, don't source previous result
|
|
|
|
* @last: Last field: stop at the first match and return bit index
|
|
|
|
*
|
|
|
|
* See nft_pipapo_avx2_lookup_4b_2().
|
|
|
|
*
|
|
|
|
* This is used for 32-bit fields (i.e. IPv4 addresses).
|
|
|
|
*
|
|
|
|
* Return: -1 on no match, rule index of match if @last, otherwise first long
|
|
|
|
* word index to be checked next (i.e. first filled word).
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
|
|
|
|
struct nft_pipapo_field *f, int offset,
|
|
|
|
const u8 *pkt, bool first, bool last)
|
|
|
|
{
|
|
|
|
u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
|
|
|
|
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
|
|
|
|
};
|
|
|
|
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
|
|
|
|
unsigned long *lt = f->lt, bsize = f->bsize;
|
|
|
|
|
|
|
|
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
|
|
|
|
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
|
|
|
|
if (first) {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(5, 0, 1);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(8, 2, 3);
|
|
|
|
NFT_PIPAPO_AVX2_AND(9, 4, 5);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(11, 6, 7);
|
|
|
|
NFT_PIPAPO_AVX2_AND(12, 8, 9);
|
|
|
|
NFT_PIPAPO_AVX2_AND(13, 10, 11);
|
|
|
|
|
|
|
|
/* Stall */
|
|
|
|
NFT_PIPAPO_AVX2_AND(1, 12, 13);
|
|
|
|
} else {
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_AND(5, 0, 1);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(8, 2, 3);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(10, 4, 5);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(12, 6, 7);
|
|
|
|
NFT_PIPAPO_AVX2_AND(13, 8, 9);
|
|
|
|
NFT_PIPAPO_AVX2_AND(14, 10, 11);
|
|
|
|
|
|
|
|
/* Stall */
|
|
|
|
NFT_PIPAPO_AVX2_AND(1, 12, 13);
|
|
|
|
NFT_PIPAPO_AVX2_AND(1, 1, 14);
|
|
|
|
}
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
|
|
|
|
|
|
|
|
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
|
|
|
|
if (last)
|
|
|
|
return b;
|
|
|
|
|
|
|
|
if (unlikely(ret == -1))
|
|
|
|
ret = b / XSAVE_YMM_SIZE;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
nomatch:
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
|
|
|
|
nothing:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
|
|
|
|
* @map: Previous match result, used as initial bitmap
|
|
|
|
* @fill: Destination bitmap to be filled with current match result
|
|
|
|
* @f: Field, containing lookup and mapping tables
|
|
|
|
* @offset: Ignore buckets before the given index, no bits are filled there
|
|
|
|
* @pkt: Packet data, pointer to input nftables register
|
|
|
|
* @first: If this is the first field, don't source previous result
|
|
|
|
* @last: Last field: stop at the first match and return bit index
|
|
|
|
*
|
|
|
|
* See nft_pipapo_avx2_lookup_4b_2().
|
|
|
|
*
|
|
|
|
* This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
|
|
|
|
*
|
|
|
|
* Return: -1 on no match, rule index of match if @last, otherwise first long
|
|
|
|
* word index to be checked next (i.e. first filled word).
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
|
|
|
|
struct nft_pipapo_field *f, int offset,
|
|
|
|
const u8 *pkt, bool first, bool last)
|
|
|
|
{
|
|
|
|
u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
|
|
|
|
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
|
|
|
|
pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
|
|
|
|
};
|
|
|
|
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
|
|
|
|
unsigned long *lt = f->lt, bsize = f->bsize;
|
|
|
|
|
|
|
|
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
|
|
|
|
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
|
|
|
|
|
|
|
|
if (!first)
|
|
|
|
NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
|
|
|
|
|
|
|
|
if (!first) {
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
|
|
|
|
NFT_PIPAPO_AVX2_AND(1, 1, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(6, 2, 3);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(9, 1, 4);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(11, 5, 6);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(13, 7, 8);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_AND(0, 9, 10);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(2, 11, 12);
|
|
|
|
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
|
|
|
|
NFT_PIPAPO_AVX2_AND(4, 13, 14);
|
|
|
|
NFT_PIPAPO_AVX2_AND(5, 0, 1);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_AND(6, 2, 3);
|
|
|
|
|
|
|
|
/* Stalls */
|
|
|
|
NFT_PIPAPO_AVX2_AND(7, 4, 5);
|
|
|
|
NFT_PIPAPO_AVX2_AND(8, 6, 7);
|
|
|
|
|
|
|
|
NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
|
|
|
|
|
|
|
|
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
|
|
|
|
if (last)
|
|
|
|
return b;
|
|
|
|
|
|
|
|
if (unlikely(ret == -1))
|
|
|
|
ret = b / XSAVE_YMM_SIZE;
|
|
|
|
|
|
|
|
continue;
|
|
|
|
nomatch:
|
|
|
|
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
|
|
|
|
nothing:
|
|
|
|
;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 128-bit fields (i.e. IPv6 addresses).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
					struct nft_pipapo_field *f, int offset,
					const u8 *pkt, bool first, bool last)
{
	/* Split each of the 16 packet bytes into high/low nibble: one 4-bit
	 * bucket index per lookup group, 32 groups in total.
	 */
	u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
		      pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
		      pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
		      pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
		      pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
		      pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
		      pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
		      pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
	};
	/* One loop iteration covers one 256-bit slice of each bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* Seed ymm0 with the previous field's result for this slice */
		if (!first)
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);

		/* Bucket loads and ANDs are deliberately interleaved below so
		 * loads complete while earlier intersections execute; the
		 * first macro argument is a ymm register index, reused in a
		 * fixed rotation. Do not reorder these statements.
		 */
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
		if (!first) {
			/* Previous slice is all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(1, 1, 0);
		}

		NFT_PIPAPO_AVX2_AND(5, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
		NFT_PIPAPO_AVX2_AND(8, 1, 4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
		NFT_PIPAPO_AVX2_AND(10, 5, 6);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
		NFT_PIPAPO_AVX2_AND(12, 7, 8);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
		NFT_PIPAPO_AVX2_AND(14, 9, 10);

		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
		NFT_PIPAPO_AVX2_AND(1, 11, 12);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
		NFT_PIPAPO_AVX2_AND(4, 13, 14);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
		NFT_PIPAPO_AVX2_AND(7, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
		NFT_PIPAPO_AVX2_AND(9, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
		NFT_PIPAPO_AVX2_AND(11, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
		NFT_PIPAPO_AVX2_AND(13, 6, 7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);

		NFT_PIPAPO_AVX2_AND(0, 8, 9);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
		NFT_PIPAPO_AVX2_AND(2, 10, 11);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
		NFT_PIPAPO_AVX2_AND(4, 12, 13);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
		NFT_PIPAPO_AVX2_AND(6, 14, 0);
		NFT_PIPAPO_AVX2_AND(7, 1, 2);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
		NFT_PIPAPO_AVX2_AND(9, 3, 4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
		NFT_PIPAPO_AVX2_AND(11, 5, 6);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
		NFT_PIPAPO_AVX2_AND(13, 7, 8);

		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
		NFT_PIPAPO_AVX2_AND(1, 9, 10);
		NFT_PIPAPO_AVX2_AND(2, 11, 12);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
		NFT_PIPAPO_AVX2_AND(4, 13, 14);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
		NFT_PIPAPO_AVX2_AND(6, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
		NFT_PIPAPO_AVX2_AND(9, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
		NFT_PIPAPO_AVX2_AND(11, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);

		/* Fold the remaining partial intersections together */
		NFT_PIPAPO_AVX2_AND(0, 6, 7);
		NFT_PIPAPO_AVX2_AND(1, 8, 9);
		NFT_PIPAPO_AVX2_AND(2, 10, 11);
		NFT_PIPAPO_AVX2_AND(3, 12, 0);

		/* Stalls */
		NFT_PIPAPO_AVX2_AND(4, 1, 2);
		NFT_PIPAPO_AVX2_AND(5, 3, 4);

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match; presumably scaled
		 * by XSAVE_YMM_SIZE to a 256-bit slot index — see callers.
		 */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm in
		 * nft_pipapo_avx2_lookup().
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 8-bit fields (i.e. protocol numbers).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	/* One loop iteration covers one 256-bit slice of the bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* Single group, no previous field: the bucket slice
			 * itself is the match result (in ymm2).
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
		} else {
			/* Intersect bucket slice with previous result */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_AND(2, 0, 1);
			/* Previous slice all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
		}

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 16-bit fields (i.e. ports).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	/* One loop iteration covers one 256-bit slice of each bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* AND the two per-byte bucket slices into ymm4 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
		} else {
			/* Also intersect with the previous field's result */
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(3, 0, 1);
			/* Previous slice all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(4, 3, 2);
		}

		/* Stall */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 32-bit fields (i.e. IPv4 addresses).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	/* One loop iteration covers one 256-bit slice of each bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* AND the four per-byte bucket slices into ymm0 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
			NFT_PIPAPO_AVX2_AND(5, 2, 3);
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
		} else {
			/* Also intersect with the previous field's result,
			 * interleaving loads and ANDs to hide load latency.
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			/* Previous slice all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
			NFT_PIPAPO_AVX2_AND(6, 2, 3);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
			NFT_PIPAPO_AVX2_AND(0, 6, 7);
		}

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
				       struct nft_pipapo_field *f, int offset,
				       const u8 *pkt, bool first, bool last)
{
	/* One loop iteration covers one 256-bit slice of each bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* AND the six per-byte bucket slices into ymm4,
			 * interleaving loads and ANDs to hide load latency.
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
			NFT_PIPAPO_AVX2_AND(7, 2, 3);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
			NFT_PIPAPO_AVX2_AND(1, 6, 7);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
		} else {
			/* Also intersect with the previous field's result */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			/* Previous slice all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);

			NFT_PIPAPO_AVX2_AND(6, 2, 3);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
			NFT_PIPAPO_AVX2_AND(2, 6, 7);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(3, 0, 1);
			NFT_PIPAPO_AVX2_AND(4, 2, 3);
		}

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
 * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
 * @map: Previous match result, used as initial bitmap
 * @fill: Destination bitmap to be filled with current match result
 * @f: Field, containing lookup and mapping tables
 * @offset: Ignore buckets before the given index, no bits are filled there
 * @pkt: Packet data, pointer to input nftables register
 * @first: If this is the first field, don't source previous result
 * @last: Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 128-bit fields (i.e. IPv6 addresses).
 *
 * Return: -1 on no match, rule index of match if @last, otherwise first long
 * word index to be checked next (i.e. first filled word).
 */
static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
					struct nft_pipapo_field *f, int offset,
					const u8 *pkt, bool first, bool last)
{
	/* One loop iteration covers one 256-bit slice of each bucket */
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip slices before @offset: no bits are filled there */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Long-word index of this 256-bit slice within the bitmaps */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* Seed ymm0 with the previous field's result for this slice */
		if (!first)
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);

		/* Bucket loads and ANDs are interleaved below to hide load
		 * latency; the first macro argument is a ymm register index,
		 * reused in a fixed rotation. Do not reorder.
		 */
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
		if (!first) {
			/* Previous slice all-zero: nothing to clear here */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(1, 1, 0);
		}
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);

		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
		NFT_PIPAPO_AVX2_AND(6, 1, 2);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
		NFT_PIPAPO_AVX2_AND(0, 3, 4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);

		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
		NFT_PIPAPO_AVX2_AND(3, 5, 6);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);

		NFT_PIPAPO_AVX2_AND(6, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
		NFT_PIPAPO_AVX2_AND(0, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
		NFT_PIPAPO_AVX2_AND(2, 6, 7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
		NFT_PIPAPO_AVX2_AND(6, 2, 3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
		NFT_PIPAPO_AVX2_AND(0, 4, 5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
		NFT_PIPAPO_AVX2_AND(2, 6, 7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
		NFT_PIPAPO_AVX2_AND(4, 0, 1);

		/* Stall */
		NFT_PIPAPO_AVX2_AND(5, 2, 3);
		NFT_PIPAPO_AVX2_AND(6, 4, 5);

		NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Remember the first slice holding a match */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* NOTE(review): ymm15 appears to be kept all-zero by the
		 * caller; storing it clears this map slice — confirm.
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
|
|
|
|
* @map: Previous match result, used as initial bitmap
|
|
|
|
* @fill: Destination bitmap to be filled with current match result
|
|
|
|
* @f: Field, containing lookup and mapping tables
|
|
|
|
* @offset: Ignore buckets before the given index, no bits are filled there
|
|
|
|
* @pkt: Packet data, pointer to input nftables register
|
|
|
|
* @first: If this is the first field, don't source previous result
|
|
|
|
* @last: Last field: stop at the first match and return bit index
|
|
|
|
*
|
|
|
|
* This function should never be called, but is provided for the case the field
|
|
|
|
* size doesn't match any of the known data types. Matching rate is
|
|
|
|
* substantially lower than AVX2 routines.
|
|
|
|
*
|
|
|
|
* Return: -1 on no match, rule index of match if @last, otherwise first long
|
|
|
|
* word index to be checked next (i.e. first filled word).
|
|
|
|
*/
|
|
|
|
static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
|
|
|
|
struct nft_pipapo_field *f, int offset,
|
|
|
|
const u8 *pkt, bool first, bool last)
|
|
|
|
{
|
2021-12-21 19:37:57 +00:00
|
|
|
unsigned long bsize = f->bsize;
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
int i, ret = -1, b;
|
|
|
|
|
|
|
|
if (first)
|
|
|
|
memset(map, 0xff, bsize * sizeof(*map));
|
|
|
|
|
|
|
|
for (i = offset; i < bsize; i++) {
|
|
|
|
if (f->bb == 8)
|
|
|
|
pipapo_and_field_buckets_8bit(f, map, pkt);
|
|
|
|
else
|
|
|
|
pipapo_and_field_buckets_4bit(f, map, pkt);
|
|
|
|
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
|
|
|
|
|
|
|
|
b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
|
|
|
|
|
|
|
|
if (last)
|
|
|
|
return b;
|
|
|
|
|
|
|
|
if (ret == -1)
|
|
|
|
ret = b / XSAVE_YMM_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
|
|
|
|
* @desc: Set description, element count and field description used
|
|
|
|
* @features: Flags: NFT_SET_INTERVAL needs to be there
|
|
|
|
* @est: Storage for estimation data
|
|
|
|
*
|
|
|
|
* Return: true if set is compatible and AVX2 available, false otherwise.
|
|
|
|
*/
|
|
|
|
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
|
|
|
|
struct nft_set_estimate *est)
|
|
|
|
{
|
nft_set_pipapo: Prepare for single ranged field usage
A few adjustments in nft_pipapo_init() are needed to allow usage of
this set back-end for a single, ranged field.
Provide a convenient NFT_PIPAPO_MIN_FIELDS definition that currently
makes sure that the rbtree back-end is selected instead, for sets
with a single field.
This finally allows a fair comparison with rbtree sets, by defining
NFT_PIPAPO_MIN_FIELDS as 0 and skipping rbtree back-end initialisation:
---------------.--------------------------.-------------------------.
AMD Epyc 7402 | baselines, Mpps | Mpps, % over rbtree |
1 thread |__________________________|_________________________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | pipapo |
---------------| hook | no | single | pipapo |single field|
type entries | drop | ranges | field |single field| AVX2 |
---------------|--------|--------|--------|------------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 6.0 +58% | 9.6 +153% |
---------------|--------|--------|--------|------------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 9.1 +57% |11.6 +100% |
---------------|--------|--------|--------|------------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.8 +55% | 6.5 +261% |
---------------|--------|--------|--------|------------|------------|
port,proto | | | | [1] | [1] |
30000 | 19.6 | 11.6 | 3.9 | 0.9 -77% | 2.7 -31% |
---------------|--------|--------|--------|------------|------------|
port,proto | | | | | |
10000 | 19.6 | 11.6 | 4.4 | 2.1 -52% | 5.6 +27% |
---------------|--------|--------|--------|------------|------------|
port,proto | | | | | |
4 threads 10000| 77.9 | 45.1 | 17.4 | 8.3 -52% |22.4 +29% |
---------------|--------|--------|--------|------------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 4.5 +5% | 8.2 +91% |
---------------|--------|--------|--------|------------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 2.8 +47% | 6.6 +247% |
---------------|--------|--------|--------|------------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 6.0 +54% | 9.9 +154% |
---------------'--------'--------'--------'------------'------------'
[1] Causes switch of lookup table buckets for 'port' to 4-bit groups
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:37 +00:00
|
|
|
if (!(features & NFT_SET_INTERVAL) ||
|
|
|
|
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
est->size = pipapo_estimate_size(desc);
|
|
|
|
if (!est->size)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
est->lookup = NFT_SET_CLASS_O_LOG_N;
|
|
|
|
|
|
|
|
est->space = NFT_SET_CLASS_O_N;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
|
|
|
|
* @net: Network namespace
|
|
|
|
* @set: nftables API set representation
|
2021-05-29 16:50:45 +00:00
|
|
|
* @key: nftables API element representation containing key data
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
* @ext: nftables API extension pointer, filled with matching reference
|
|
|
|
*
|
|
|
|
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
|
|
|
|
*
|
|
|
|
* This implementation exploits the repetitive characteristic of the algorithm
|
|
|
|
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
|
|
|
|
*
|
|
|
|
* Return: true on match, false otherwise.
|
|
|
|
*/
|
|
|
|
bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
|
|
|
|
const u32 *key, const struct nft_set_ext **ext)
|
|
|
|
{
|
|
|
|
struct nft_pipapo *priv = nft_set_priv(set);
|
netfilter: nft_set_pipapo: store index in scratch maps
[ Upstream commit 76313d1a4aa9e30d5b43dee5efd8bcd4d8250006 ]
Pipapo needs a scratchpad area to keep state during matching.
This state can be large and thus cannot reside on stack.
Each set preallocates percpu areas for this.
On each match stage, one scratchpad half starts with all-zero and the other
is inited to all-ones.
At the end of each stage, the half that starts with all-ones is
always zero. Before next field is tested, pointers to the two halves
are swapped, i.e. resmap pointer turns into fill pointer and vice versa.
After the last field has been processed, pipapo stashes the
index toggle in a percpu variable, with assumption that next packet
will start with the all-zero half and sets all bits in the other to 1.
This isn't reliable.
There can be multiple sets and we can't be sure that the upper
and lower half of all set scratch map is always in sync (lookups
can be conditional), so one set might have swapped, but other might
not have been queried.
Thus we need to keep the index per-set-and-cpu, just like the
scratchpad.
Note that this bug fix is incomplete, there is a related issue.
avx2 and normal implementation might use slightly different areas of the
map array space due to the avx2 alignment requirements, so
m->scratch (generic/fallback implementation) and ->scratch_aligned
(avx) may partially overlap. scratch and scratch_aligned are not distinct
objects, the latter is just the aligned address of the former.
After this change, write to scratch_align->map_index may write to
scratch->map, so this issue becomes more prominent, we can set to 1
a bit in the supposedly-all-zero area of scratch->map[].
A followup patch will remove the scratch_aligned and make generic and
avx code use the same (aligned) area.
It's done in a separate change to ease review.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-07 20:52:46 +00:00
|
|
|
struct nft_pipapo_scratch *scratch;
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
u8 genmask = nft_genmask_cur(net);
|
|
|
|
const u8 *rp = (const u8 *)key;
|
|
|
|
struct nft_pipapo_match *m;
|
|
|
|
struct nft_pipapo_field *f;
|
netfilter: nft_set_pipapo: store index in scratch maps
[ Upstream commit 76313d1a4aa9e30d5b43dee5efd8bcd4d8250006 ]
Pipapo needs a scratchpad area to keep state during matching.
This state can be large and thus cannot reside on stack.
Each set preallocates percpu areas for this.
On each match stage, one scratchpad half starts with all-zero and the other
is inited to all-ones.
At the end of each stage, the half that starts with all-ones is
always zero. Before next field is tested, pointers to the two halves
are swapped, i.e. resmap pointer turns into fill pointer and vice versa.
After the last field has been processed, pipapo stashes the
index toggle in a percpu variable, with assumption that next packet
will start with the all-zero half and sets all bits in the other to 1.
This isn't reliable.
There can be multiple sets and we can't be sure that the upper
and lower half of all set scratch map is always in sync (lookups
can be conditional), so one set might have swapped, but other might
not have been queried.
Thus we need to keep the index per-set-and-cpu, just like the
scratchpad.
Note that this bug fix is incomplete, there is a related issue.
avx2 and normal implementation might use slightly different areas of the
map array space due to the avx2 alignment requirements, so
m->scratch (generic/fallback implementation) and ->scratch_aligned
(avx) may partially overlap. scratch and scratch_aligned are not distinct
objects, the latter is just the aligned address of the former.
After this change, write to scratch_align->map_index may write to
scratch->map, so this issue becomes more prominent, we can set to 1
a bit in the supposedly-all-zero area of scratch->map[].
A followup patch will remove the scratch_aligned and make generic and
avx code use the same (aligned) area.
It's done in a separate change to ease review.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-07 20:52:46 +00:00
|
|
|
unsigned long *res, *fill;
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
bool map_index;
|
|
|
|
int i, ret = 0;
|
|
|
|
|
2021-05-10 05:58:22 +00:00
|
|
|
if (unlikely(!irq_fpu_usable()))
|
|
|
|
return nft_pipapo_lookup(net, set, key, ext);
|
|
|
|
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
m = rcu_dereference(priv->match);
|
|
|
|
|
netfilter: nft_set_pipapo_avx2: Skip LDMXCSR, we don't need a valid MXCSR state
We don't need a valid MXCSR state for the lookup routines, none of
the instructions we use rely on or affect any bit in the MXCSR
register.
Instead of calling kernel_fpu_begin(), we can pass 0 as mask to
kernel_fpu_begin_mask() and spare one LDMXCSR instruction.
Commit 49200d17d27d ("x86/fpu/64: Don't FNINIT in kernel_fpu_begin()")
already speeds up lookups considerably, and by dropping the MCXSR
initialisation we can now get a much smaller, but measurable, increase
in matching rates.
The table below reports matching rates and a wild approximation of
clock cycles needed for a match in a "port,net" test with 10 entries
from selftests/netfilter/nft_concat_range.sh, limited to the first
field, i.e. the port (with nft_set_rbtree initialisation skipped), run
on a single AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$).
The (very rough) estimation of clock cycles is obtained by simply
dividing frequency by matching rate. The "cycles spared" column refers
to the difference in cycles compared to the previous row, and the rate
increase also refers to the previous row. Results are averages of six
runs.
Merely for context, I'm also reporting packet rates obtained by
skipping kernel_fpu_begin() and kernel_fpu_end() altogether (which
shows a very limited impact now), as well as skipping the whole lookup
function, compared to simply counting and dropping all packets using
the netdev hook drop (see nft_concat_range.sh for details). This
workload also includes packet generation with pktgen and the receive
path of veth.
|matching| est. | cycles | rate |
| rate | cycles | spared |increase|
| (Mpps) | | | |
--------------------------------------|--------|--------|--------|--------|
FNINIT, LDMXCSR (before 49200d17d27d) | 5.245 | 553 | - | - |
LDMXCSR only (with 49200d17d27d) | 6.347 | 457 | 96 | 21.0% |
Without LDMXCSR (this patch) | 6.461 | 449 | 8 | 1.8% |
-------- for reference only: ---------|--------|--------|--------|--------|
Without kernel_fpu_begin() | 6.513 | 445 | 4 | 0.8% |
Without actual matching (return true) | 7.649 | 379 | 66 | 17.4% |
Without lookup operation (netdev drop)| 10.320 | 281 | 98 | 34.9% |
The clock cycles spared by avoiding LDMXCSR appear to be in line with CPI
and latency indicated in the manuals of comparable architectures: Intel
Skylake (CPI: 1, latency: 7) and AMD 12h (latency: 12) -- I couldn't find
this information for AMD 17h.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2021-05-10 05:58:52 +00:00
|
|
|
/* This also protects access to all data related to scratch maps.
|
|
|
|
*
|
|
|
|
* Note that we don't need a valid MXCSR state for any of the
|
|
|
|
* operations we use here, so pass 0 as mask and spare a LDMXCSR
|
|
|
|
* instruction.
|
|
|
|
*/
|
|
|
|
kernel_fpu_begin_mask(0);
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
|
2024-02-08 09:31:29 +00:00
|
|
|
scratch = *raw_cpu_ptr(m->scratch);
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +157% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
if (unlikely(!scratch)) {
|
|
|
|
kernel_fpu_end();
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
netfilter: nft_set_pipapo: store index in scratch maps
[ Upstream commit 76313d1a4aa9e30d5b43dee5efd8bcd4d8250006 ]
Pipapo needs a scratchpad area to keep state during matching.
This state can be large and thus cannot reside on stack.
Each set preallocates percpu areas for this.
On each match stage, one scratchpad half starts with all-zero and the other
is inited to all-ones.
At the end of each stage, the half that starts with all-ones is
always zero. Before next field is tested, pointers to the two halves
are swapped, i.e. resmap pointer turns into fill pointer and vice versa.
After the last field has been processed, pipapo stashes the
index toggle in a percpu variable, with assumption that next packet
will start with the all-zero half and sets all bits in the other to 1.
This isn't reliable.
There can be multiple sets and we can't be sure that the upper
and lower half of all set scratch map is always in sync (lookups
can be conditional), so one set might have swapped, but other might
not have been queried.
Thus we need to keep the index per-set-and-cpu, just like the
scratchpad.
Note that this bug fix is incomplete, there is a related issue.
avx2 and normal implementation might use slightly different areas of the
map array space due to the avx2 alignment requirements, so
m->scratch (generic/fallback implementation) and ->scratch_aligned
(avx) may partially overlap. scratch and scratch_aligned are not distinct
objects, the latter is just the aligned address of the former.
After this change, write to scratch_align->map_index may write to
scratch->map, so this issue becomes more prominent, we can set to 1
a bit in the supposedly-all-zero area of scratch->map[].
A followup patch will remove the scratch_aligned and make generic and
avx code use the same (aligned) area.
It's done in a separate change to ease review.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-07 20:52:46 +00:00
|
|
|
map_index = scratch->map_index;
|
|
|
|
|
|
|
|
res = scratch->map + (map_index ? m->bsize_max : 0);
|
|
|
|
fill = scratch->map + (map_index ? 0 : m->bsize_max);
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the ingoing device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
|
|
|
|
/* Starting map doesn't need to be set for this implementation */
|
|
|
|
|
|
|
|
nft_pipapo_avx2_prepare();
|
|
|
|
|
|
|
|
next_match:
|
|
|
|
nft_pipapo_for_each_field(f, i, m) {
|
|
|
|
bool last = i == m->field_count - 1, first = !i;
|
|
|
|
|
|
|
|
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
|
|
|
|
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
|
|
|
|
ret, rp, \
|
|
|
|
first, last))
|
|
|
|
|
|
|
|
if (likely(f->bb == 8)) {
|
|
|
|
if (f->groups == 1) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
|
|
|
|
} else if (f->groups == 2) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
|
|
|
|
} else if (f->groups == 4) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
|
|
|
|
} else if (f->groups == 6) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
|
|
|
|
} else if (f->groups == 16) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
|
|
|
|
} else {
|
|
|
|
ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
|
|
|
|
ret, rp,
|
|
|
|
first, last);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (f->groups == 2) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
|
|
|
|
} else if (f->groups == 4) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
|
|
|
|
} else if (f->groups == 8) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
|
|
|
|
} else if (f->groups == 12) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
|
|
|
|
} else if (f->groups == 32) {
|
|
|
|
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
|
|
|
|
} else {
|
|
|
|
ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
|
|
|
|
ret, rp,
|
|
|
|
first, last);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
|
|
|
|
|
|
|
|
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
|
|
|
|
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (last) {
|
|
|
|
*ext = &f->mt[ret].e->ext;
|
|
|
|
if (unlikely(nft_set_elem_expired(*ext) ||
|
|
|
|
!nft_set_elem_active(*ext, genmask))) {
|
|
|
|
ret = 0;
|
|
|
|
goto next_match;
|
|
|
|
}
|
|
|
|
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
swap(res, fill);
|
|
|
|
rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (i % 2)
|
netfilter: nft_set_pipapo: store index in scratch maps
[ Upstream commit 76313d1a4aa9e30d5b43dee5efd8bcd4d8250006 ]
Pipapo needs a scratchpad area to keep state during matching.
This state can be large and thus cannot reside on stack.
Each set preallocates percpu areas for this.
On each match stage, one scratchpad half starts with all-zero and the other
is inited to all-ones.
At the end of each stage, the half that starts with all-ones is
always zero. Before next field is tested, pointers to the two halves
are swapped, i.e. resmap pointer turns into fill pointer and vice versa.
After the last field has been processed, pipapo stashes the
index toggle in a percpu variable, with assumption that next packet
will start with the all-zero half and sets all bits in the other to 1.
This isn't reliable.
There can be multiple sets and we can't be sure that the upper
and lower half of all set scratch map is always in sync (lookups
can be conditional), so one set might have swapped, but other might
not have been queried.
Thus we need to keep the index per-set-and-cpu, just like the
scratchpad.
Note that this bug fix is incomplete, there is a related issue.
avx2 and normal implementation might use slightly different areas of the
map array space due to the avx2 alignment requirements, so
m->scratch (generic/fallback implementation) and ->scratch_aligned
(avx) may partially overlap. scratch and scratch_aligned are not distinct
objects, the latter is just the aligned address of the former.
After this change, write to scratch_align->map_index may write to
scratch->map, so this issue becomes more prominent, we can set to 1
a bit in the supposedly-all-zero area of scratch->map[].
A followup patch will remove the scratch_aligned and makes generic and
avx code use the same (aligned) area.
It's done in a separate change to ease review.
Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
2024-02-07 20:52:46 +00:00
|
|
|
scratch->map_index = !map_index;
|
nft_set_pipapo: Introduce AVX2-based lookup implementation
If the AVX2 set is available, we can exploit the repetitive
characteristic of this algorithm to provide a fast, vectorised
version by using 256-bit wide AVX2 operations for bucket loads and
bitwise intersections.
In most cases, this implementation consistently outperforms rbtree
set instances despite the fact they are configured to use a given,
single, ranged data type out of the ones used for performance
measurements by the nft_concat_range.sh kselftest.
That script, injecting packets directly on the incoming device path
with pktgen, reports, averaged over five runs on a single AMD Epyc
7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below.
CONFIG_RETPOLINE was not set here.
Note that this is not a fair comparison over hash and rbtree set
types: non-ranged entries (used to have a reference for hash types)
would be matched faster than this, and matching on a single field
only (which is the case for rbtree) is also significantly faster.
However, it's not possible at the moment to choose this set type
for non-ranged entries, and the current implementation also needs
a few minor adjustments in order to match on less than two fields.
---------------.-----------------------------------.------------.
AMD Epyc 7402 | baselines, Mpps | this patch |
1 thread |___________________________________|____________|
3.35GHz | | | | | |
768KiB L1D$ | netdev | hash | rbtree | | |
---------------| hook | no | single | | pipapo |
type entries | drop | ranges | field | pipapo | AVX2 |
---------------|--------|--------|--------|--------|------------|
net,port | | | | | |
1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% |
---------------|--------|--------|--------|--------|------------|
port,net | | | | | |
100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% |
---------------|--------|--------|--------|--------|------------|
net6,port | | | | | |
1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% |
---------------|--------|--------|--------|--------|------------|
port,proto | | | | | |
30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac | | | | | |
10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% |
---------------|--------|--------|--------|--------|------------|
net6,port,mac, | | | | | |
proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% |
---------------|--------|--------|--------|--------|------------|
net,mac | | | | | |
1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% |
---------------'--------'--------'--------'--------'------------'
A similar strategy could be easily reused to implement specialised
versions for other SIMD sets, and I plan to post at least a NEON
version at a later time.
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
|
|
|
kernel_fpu_end();
|
|
|
|
|
|
|
|
return ret >= 0;
|
|
|
|
}
|