linux-stable/net/netfilter/nft_set_pipapo_avx2.c

1229 lines
39 KiB
C
Raw Normal View History

nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
// SPDX-License-Identifier: GPL-2.0-only
/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
*
* Copyright (c) 2019-2020 Red Hat GmbH
*
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <asm/fpu/api.h>
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
/* Load from memory into YMM register with non-temporal hint ("stream load"),
* that is, don't fetch lines from memory into the cache. This avoids pushing
* precious packet data out of the cache hierarchy, and is appropriate when:
*
* - loading buckets from lookup tables, as they are not going to be used
* again before packets are entirely classified
*
* - loading the result bitmap from the previous field, as it's never used
* again
*/
#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
/* Stream a single lookup table bucket into YMM register given lookup table,
* group index, value of packet bits, bucket size.
*/
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
NFT_PIPAPO_AVX2_LOAD(reg, \
lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
(v)) * (bsize)])
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
NFT_PIPAPO_AVX2_LOAD(reg, \
lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
(v)) * (bsize)])
/* Bitwise AND: the staple operation of this algorithm */
#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
/* Jump to label if @reg is zero */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
"je %l[" #label "]" : : : : label)
/* Store 256 bits from YMM register into memory. Contrary to bucket load
* operation, we don't bypass the cache here, as stored matching results
* are always used shortly after.
*/
#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
/* Zero out a complete YMM register, @reg */
#define NFT_PIPAPO_AVX2_ZERO(reg) \
asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
/* Current working bitmap index, toggled between field matches */
static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
/**
* nft_pipapo_avx2_prepare() - Prepare before main algorithm body
*
* This zeroes out ymm15, which is later used whenever we need to clear a
* memory location, by storing its content into memory.
*/
static void nft_pipapo_avx2_prepare(void)
{
NFT_PIPAPO_AVX2_ZERO(15);
}
/**
* nft_pipapo_avx2_fill() - Fill a bitmap region with ones
* @data: Base memory area
* @start: First bit to set
* @len: Count of bits to fill
*
* This is nothing else than a version of bitmap_set(), as used e.g. by
* pipapo_refill(), tailored for the microarchitectures using it and better
* suited for the specific usage: it's very likely that we'll set a small number
* of bits, not crossing a word boundary, and correct branch prediction is
* critical here.
*
* This function doesn't actually use any AVX2 instruction.
*/
static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
{
int offset = start % BITS_PER_LONG;
unsigned long mask;
data += start / BITS_PER_LONG;
if (likely(len == 1)) {
*data |= BIT(offset);
return;
}
if (likely(len < BITS_PER_LONG || offset)) {
if (likely(len + offset <= BITS_PER_LONG)) {
*data |= GENMASK(len - 1 + offset, offset);
return;
}
*data |= ~0UL << offset;
len -= BITS_PER_LONG - offset;
data++;
if (len <= BITS_PER_LONG) {
mask = ~0UL >> (BITS_PER_LONG - len);
*data |= mask;
return;
}
}
memset(data, 0xff, len / BITS_PER_BYTE);
data += len / BITS_PER_LONG;
len %= BITS_PER_LONG;
if (len)
*data |= ~0UL >> (BITS_PER_LONG - len);
}
/**
* nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
* @offset: Start from given bitmap (equivalent to bucket) offset, in longs
* @map: Bitmap to be scanned for set bits
* @dst: Destination bitmap
* @mt: Mapping table containing bit set specifiers
* @last: Return index of first set bit, if this is the last field
*
* This is an alternative implementation of pipapo_refill() suitable for usage
* with AVX2 lookup routines: we know there are four words to be scanned, at
* a given offset inside the map, for each matching iteration.
*
* This function doesn't actually use any AVX2 instruction.
*
* Return: first set bit index if @last, index of first filled word otherwise.
*/
static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
unsigned long *dst,
union nft_pipapo_map_bucket *mt, bool last)
{
int ret = -1;
#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \
do { \
while (map[(x)]) { \
int r = __builtin_ctzl(map[(x)]); \
int i = (offset + (x)) * BITS_PER_LONG + r; \
\
if (last) \
return i; \
\
nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \
\
if (ret == -1) \
ret = mt[i].to; \
\
map[(x)] &= ~(1UL << r); \
} \
} while (0)
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
return ret;
}
/**
* nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* Load buckets from lookup table corresponding to the values of each 4-bit
* group of packet bytes, and perform a bitwise intersection between them. If
* this is the first field in the set, simply AND the buckets together
* (equivalent to using an all-ones starting bitmap), use the provided starting
* bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
* working bitmap, @fill.
*
* This is used for 8-bit fields (i.e. protocol numbers).
*
* Out-of-order (and superscalar) execution is vital here, so it's critical to
* avoid false data dependencies. CPU and compiler could (mostly) take care of
* this on their own, but the operation ordering is explicitly given here with
* a likely execution order in mind, to highlight possible stalls. That's why
* a number of logically distinct operations (i.e. loading buckets, intersecting
* buckets) are interleaved.
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
NFT_PIPAPO_AVX2_AND(3, 0, 1);
NFT_PIPAPO_AVX2_AND(4, 2, 3);
}
NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 16-bit fields (i.e. ports).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
NFT_PIPAPO_AVX2_AND(5, 2, 3);
NFT_PIPAPO_AVX2_AND(7, 4, 5);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
NFT_PIPAPO_AVX2_AND(7, 4, 5);
/* Stall */
NFT_PIPAPO_AVX2_AND(7, 6, 7);
}
/* Stall */
NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 32-bit fields (i.e. IPv4 addresses).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
};
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize);
NFT_PIPAPO_AVX2_AND(8, 2, 3);
NFT_PIPAPO_AVX2_AND(9, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
NFT_PIPAPO_AVX2_AND(11, 6, 7);
NFT_PIPAPO_AVX2_AND(12, 8, 9);
NFT_PIPAPO_AVX2_AND(13, 10, 11);
/* Stall */
NFT_PIPAPO_AVX2_AND(1, 12, 13);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
NFT_PIPAPO_AVX2_AND(8, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
NFT_PIPAPO_AVX2_AND(10, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
NFT_PIPAPO_AVX2_AND(12, 6, 7);
NFT_PIPAPO_AVX2_AND(13, 8, 9);
NFT_PIPAPO_AVX2_AND(14, 10, 11);
/* Stall */
NFT_PIPAPO_AVX2_AND(1, 12, 13);
NFT_PIPAPO_AVX2_AND(1, 1, 14);
}
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
};
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (!first)
NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
if (!first) {
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
NFT_PIPAPO_AVX2_AND(1, 1, 0);
}
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize);
NFT_PIPAPO_AVX2_AND(9, 1, 4);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
NFT_PIPAPO_AVX2_AND(11, 5, 6);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize);
NFT_PIPAPO_AVX2_AND(13, 7, 8);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize);
NFT_PIPAPO_AVX2_AND(0, 9, 10);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize);
NFT_PIPAPO_AVX2_AND(2, 11, 12);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
NFT_PIPAPO_AVX2_AND(4, 13, 14);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
/* Stalls */
NFT_PIPAPO_AVX2_AND(7, 4, 5);
NFT_PIPAPO_AVX2_AND(8, 6, 7);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 128-bit fields (i.e. IPv6 addresses).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf,
pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf,
pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf,
pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf,
pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf,
pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
};
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (!first)
NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
if (!first) {
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
NFT_PIPAPO_AVX2_AND(1, 1, 0);
}
NFT_PIPAPO_AVX2_AND(5, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize);
NFT_PIPAPO_AVX2_AND(8, 1, 4);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize);
NFT_PIPAPO_AVX2_AND(10, 5, 6);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
NFT_PIPAPO_AVX2_AND(12, 7, 8);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize);
NFT_PIPAPO_AVX2_AND(14, 9, 10);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize);
NFT_PIPAPO_AVX2_AND(1, 11, 12);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize);
NFT_PIPAPO_AVX2_AND(4, 13, 14);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize);
NFT_PIPAPO_AVX2_AND(7, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize);
NFT_PIPAPO_AVX2_AND(9, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
NFT_PIPAPO_AVX2_AND(11, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
NFT_PIPAPO_AVX2_AND(13, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);
NFT_PIPAPO_AVX2_AND(0, 8, 9);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize);
NFT_PIPAPO_AVX2_AND(2, 10, 11);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize);
NFT_PIPAPO_AVX2_AND(4, 12, 13);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize);
NFT_PIPAPO_AVX2_AND(6, 14, 0);
NFT_PIPAPO_AVX2_AND(7, 1, 2);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize);
NFT_PIPAPO_AVX2_AND(9, 3, 4);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
NFT_PIPAPO_AVX2_AND(11, 5, 6);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
NFT_PIPAPO_AVX2_AND(13, 7, 8);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize);
NFT_PIPAPO_AVX2_AND(1, 9, 10);
NFT_PIPAPO_AVX2_AND(2, 11, 12);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize);
NFT_PIPAPO_AVX2_AND(4, 13, 14);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize);
NFT_PIPAPO_AVX2_AND(6, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize);
NFT_PIPAPO_AVX2_AND(9, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
NFT_PIPAPO_AVX2_AND(11, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);
NFT_PIPAPO_AVX2_AND(0, 6, 7);
NFT_PIPAPO_AVX2_AND(1, 8, 9);
NFT_PIPAPO_AVX2_AND(2, 10, 11);
NFT_PIPAPO_AVX2_AND(3, 12, 0);
/* Stalls */
NFT_PIPAPO_AVX2_AND(4, 1, 2);
NFT_PIPAPO_AVX2_AND(5, 3, 4);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 8-bit fields (i.e. protocol numbers).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
NFT_PIPAPO_AVX2_AND(2, 0, 1);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
}
NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 16-bit fields (i.e. ports).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
} else {
NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
/* Stall */
NFT_PIPAPO_AVX2_AND(3, 0, 1);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
NFT_PIPAPO_AVX2_AND(4, 3, 2);
}
/* Stall */
NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 32-bit fields (i.e. IPv4 addresses).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
/* Stall */
NFT_PIPAPO_AVX2_AND(4, 0, 1);
NFT_PIPAPO_AVX2_AND(5, 2, 3);
NFT_PIPAPO_AVX2_AND(0, 4, 5);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
/* Stall */
NFT_PIPAPO_AVX2_AND(7, 4, 5);
NFT_PIPAPO_AVX2_AND(0, 6, 7);
}
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (first) {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize);
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
NFT_PIPAPO_AVX2_AND(7, 2, 3);
/* Stall */
NFT_PIPAPO_AVX2_AND(0, 4, 5);
NFT_PIPAPO_AVX2_AND(1, 6, 7);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
} else {
NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
NFT_PIPAPO_AVX2_AND(5, 0, 1);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize);
NFT_PIPAPO_AVX2_AND(0, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
/* Stall */
NFT_PIPAPO_AVX2_AND(3, 0, 1);
NFT_PIPAPO_AVX2_AND(4, 2, 3);
}
NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* See nft_pipapo_avx2_lookup_4b_2().
*
* This is used for 128-bit fields (i.e. IPv6 addresses).
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
unsigned long *lt = f->lt, bsize = f->bsize;
lt += offset * NFT_PIPAPO_LONGS_PER_M256;
for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
if (!first)
NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize);
if (!first) {
NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
NFT_PIPAPO_AVX2_AND(1, 1, 0);
}
NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize);
NFT_PIPAPO_AVX2_AND(6, 1, 2);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize);
NFT_PIPAPO_AVX2_AND(0, 3, 4);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize);
NFT_PIPAPO_AVX2_AND(3, 5, 6);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
NFT_PIPAPO_AVX2_AND(0, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
NFT_PIPAPO_AVX2_AND(0, 4, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
NFT_PIPAPO_AVX2_AND(4, 0, 1);
/* Stall */
NFT_PIPAPO_AVX2_AND(5, 2, 3);
NFT_PIPAPO_AVX2_AND(6, 4, 5);
NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
if (last)
return b;
if (unlikely(ret == -1))
ret = b / XSAVE_YMM_SIZE;
continue;
nomatch:
NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
;
}
return ret;
}
/**
* nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
* @map: Previous match result, used as initial bitmap
* @fill: Destination bitmap to be filled with current match result
* @f: Field, containing lookup and mapping tables
* @offset: Ignore buckets before the given index, no bits are filled there
* @pkt: Packet data, pointer to input nftables register
* @first: If this is the first field, don't source previous result
* @last: Last field: stop at the first match and return bit index
*
* This function should never be called, but is provided for the case the field
* size doesn't match any of the known data types. Matching rate is
* substantially lower than AVX2 routines.
*
* Return: -1 on no match, rule index of match if @last, otherwise first long
* word index to be checked next (i.e. first filled word).
*/
static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
struct nft_pipapo_field *f, int offset,
const u8 *pkt, bool first, bool last)
{
unsigned long bsize = f->bsize;
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
int i, ret = -1, b;
if (first)
memset(map, 0xff, bsize * sizeof(*map));
for (i = offset; i < bsize; i++) {
if (f->bb == 8)
pipapo_and_field_buckets_8bit(f, map, pkt);
else
pipapo_and_field_buckets_4bit(f, map, pkt);
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
if (last)
return b;
if (ret == -1)
ret = b / XSAVE_YMM_SIZE;
}
return ret;
}
/**
* nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
* @desc: Set description, element count and field description used
* @features: Flags: NFT_SET_INTERVAL needs to be there
* @est: Storage for estimation data
*
* Return: true if set is compatible and AVX2 available, false otherwise.
*/
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est)
{
nft_set_pipapo: Prepare for single ranged field usage A few adjustments in nft_pipapo_init() are needed to allow usage of this set back-end for a single, ranged field. Provide a convenient NFT_PIPAPO_MIN_FIELDS definition that currently makes sure that the rbtree back-end is selected instead, for sets with a single field. This finally allows a fair comparison with rbtree sets, by defining NFT_PIPAPO_MIN_FIELDS as 0 and skipping rbtree back-end initialisation: ---------------.--------------------------.-------------------------. AMD Epyc 7402 | baselines, Mpps | Mpps, % over rbtree | 1 thread |__________________________|_________________________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | pipapo | ---------------| hook | no | single | pipapo |single field| type entries | drop | ranges | field |single field| AVX2 | ---------------|--------|--------|--------|------------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 6.0 +58% | 9.6 +153% | ---------------|--------|--------|--------|------------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 9.1 +57% |11.6 +100% | ---------------|--------|--------|--------|------------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.8 +55% | 6.5 +261% | ---------------|--------|--------|--------|------------|------------| port,proto | | | | [1] | [1] | 30000 | 19.6 | 11.6 | 3.9 | 0.9 -77% | 2.7 -31% | ---------------|--------|--------|--------|------------|------------| port,proto | | | | | | 10000 | 19.6 | 11.6 | 4.4 | 2.1 -52% | 5.6 +27% | ---------------|--------|--------|--------|------------|------------| port,proto | | | | | | 4 threads 10000| 77.9 | 45.1 | 17.4 | 8.3 -52% |22.4 +29% | ---------------|--------|--------|--------|------------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 4.5 +5% | 8.2 +91% | ---------------|--------|--------|--------|------------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 2.8 +47% | 6.6 +247% | ---------------|--------|--------|--------|------------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 6.0 +54% | 9.9 +154% | ---------------'--------'--------'--------'------------'------------' [1] Causes switch of lookup table buckets for 'port' to 4-bit groups Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:37 +00:00
if (!(features & NFT_SET_INTERVAL) ||
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
return false;
if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
return false;
est->size = pipapo_estimate_size(desc);
if (!est->size)
return false;
est->lookup = NFT_SET_CLASS_O_LOG_N;
est->space = NFT_SET_CLASS_O_N;
return true;
}
/**
* nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
* @net: Network namespace
* @set: nftables API set representation
* @key: nftables API element representation containing key data
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
* @ext: nftables API extension pointer, filled with matching reference
*
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
*
* This implementation exploits the repetitive characteristic of the algorithm
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
*
* Return: true on match, false otherwise.
*/
bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext)
{
struct nft_pipapo *priv = nft_set_priv(set);
unsigned long *res, *fill, *scratch;
u8 genmask = nft_genmask_cur(net);
const u8 *rp = (const u8 *)key;
struct nft_pipapo_match *m;
struct nft_pipapo_field *f;
bool map_index;
int i, ret = 0;
netfilter: nft_set_pipapo_avx2: Add irq_fpu_usable() check, fallback to non-AVX2 version Arturo reported this backtrace: [709732.358791] WARNING: CPU: 3 PID: 456 at arch/x86/kernel/fpu/core.c:128 kernel_fpu_begin_mask+0xae/0xe0 [709732.358793] Modules linked in: binfmt_misc nft_nat nft_chain_nat nf_nat nft_counter nft_ct nf_tables nf_conntrack_netlink nfnetlink 8021q garp stp mrp llc vrf intel_rapl_msr intel_rapl_common skx_edac nfit libnvdimm ipmi_ssif x86_pkg_temp_thermal intel_powerclamp coretemp crc32_pclmul mgag200 ghash_clmulni_intel drm_kms_helper cec aesni_intel drm libaes crypto_simd cryptd glue_helper mei_me dell_smbios iTCO_wdt evdev intel_pmc_bxt iTCO_vendor_support dcdbas pcspkr rapl dell_wmi_descriptor wmi_bmof sg i2c_algo_bit watchdog mei acpi_ipmi ipmi_si button nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipmi_devintf ipmi_msghandler ip_tables x_tables autofs4 ext4 crc16 mbcache jbd2 dm_mod raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor sd_mod t10_pi crc_t10dif crct10dif_generic raid6_pq libcrc32c crc32c_generic raid1 raid0 multipath linear md_mod ahci libahci tg3 libata xhci_pci libphy xhci_hcd ptp usbcore crct10dif_pclmul crct10dif_common bnxt_en crc32c_intel scsi_mod [709732.358941] pps_core i2c_i801 lpc_ich i2c_smbus wmi usb_common [709732.358957] CPU: 3 PID: 456 Comm: jbd2/dm-0-8 Not tainted 5.10.0-0.bpo.5-amd64 #1 Debian 5.10.24-1~bpo10+1 [709732.358959] Hardware name: Dell Inc. PowerEdge R440/04JN2K, BIOS 2.9.3 09/23/2020 [709732.358964] RIP: 0010:kernel_fpu_begin_mask+0xae/0xe0 [709732.358969] Code: ae 54 24 04 83 e3 01 75 38 48 8b 44 24 08 65 48 33 04 25 28 00 00 00 75 33 48 83 c4 10 5b c3 65 8a 05 5e 21 5e 76 84 c0 74 92 <0f> 0b eb 8e f0 80 4f 01 40 48 81 c7 00 14 00 00 e8 dd fb ff ff eb [709732.358972] RSP: 0018:ffffbb9700304740 EFLAGS: 00010202 [709732.358976] RAX: 0000000000000001 RBX: 0000000000000003 RCX: 0000000000000001 [709732.358979] RDX: ffffbb9700304970 RSI: ffff922fe1952e00 RDI: 0000000000000003 [709732.358981] RBP: ffffbb9700304970 R08: ffff922fc868a600 R09: ffff922fc711e462 [709732.358984] R10: 000000000000005f R11: ffff922ff0b27180 R12: ffffbb9700304960 [709732.358987] R13: ffffbb9700304b08 R14: ffff922fc664b6c8 R15: ffff922fc664b660 [709732.358990] FS: 0000000000000000(0000) GS:ffff92371fec0000(0000) knlGS:0000000000000000 [709732.358993] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [709732.358996] CR2: 0000557a6655bdd0 CR3: 000000026020a001 CR4: 00000000007706e0 [709732.358999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [709732.359001] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [709732.359003] PKRU: 55555554 [709732.359005] Call Trace: [709732.359009] <IRQ> [709732.359035] nft_pipapo_avx2_lookup+0x4c/0x1cba [nf_tables] [709732.359046] ? sched_clock+0x5/0x10 [709732.359054] ? sched_clock_cpu+0xc/0xb0 [709732.359061] ? record_times+0x16/0x80 [709732.359068] ? plist_add+0xc1/0x100 [709732.359073] ? psi_group_change+0x47/0x230 [709732.359079] ? skb_clone+0x4d/0xb0 [709732.359085] ? enqueue_task_rt+0x22b/0x310 [709732.359098] ? bnxt_start_xmit+0x1e8/0xaf0 [bnxt_en] [709732.359102] ? packet_rcv+0x40/0x4a0 [709732.359121] nft_lookup_eval+0x59/0x160 [nf_tables] [709732.359133] nft_do_chain+0x350/0x500 [nf_tables] [709732.359152] ? nft_lookup_eval+0x59/0x160 [nf_tables] [709732.359163] ? nft_do_chain+0x364/0x500 [nf_tables] [709732.359172] ? fib4_rule_action+0x6d/0x80 [709732.359178] ? fib_rules_lookup+0x107/0x250 [709732.359184] nft_nat_do_chain+0x8a/0xf2 [nft_chain_nat] [709732.359193] nf_nat_inet_fn+0xea/0x210 [nf_nat] [709732.359202] nf_nat_ipv4_out+0x14/0xa0 [nf_nat] [709732.359207] nf_hook_slow+0x44/0xc0 [709732.359214] ip_output+0xd2/0x100 [709732.359221] ? __ip_finish_output+0x210/0x210 [709732.359226] ip_forward+0x37d/0x4a0 [709732.359232] ? ip4_key_hashfn+0xb0/0xb0 [709732.359238] ip_sublist_rcv_finish+0x4f/0x60 [709732.359243] ip_sublist_rcv+0x196/0x220 [709732.359250] ? ip_rcv_finish_core.isra.22+0x400/0x400 [709732.359255] ip_list_rcv+0x137/0x160 [709732.359264] __netif_receive_skb_list_core+0x29b/0x2c0 [709732.359272] netif_receive_skb_list_internal+0x1a6/0x2d0 [709732.359280] gro_normal_list.part.156+0x19/0x40 [709732.359286] napi_complete_done+0x67/0x170 [709732.359298] bnxt_poll+0x105/0x190 [bnxt_en] [709732.359304] ? irqentry_exit+0x29/0x30 [709732.359309] ? asm_common_interrupt+0x1e/0x40 [709732.359315] net_rx_action+0x144/0x3c0 [709732.359322] __do_softirq+0xd5/0x29c [709732.359329] asm_call_irq_on_stack+0xf/0x20 [709732.359332] </IRQ> [709732.359339] do_softirq_own_stack+0x37/0x40 [709732.359346] irq_exit_rcu+0x9d/0xa0 [709732.359353] common_interrupt+0x78/0x130 [709732.359358] asm_common_interrupt+0x1e/0x40 [709732.359366] RIP: 0010:crc_41+0x0/0x1e [crc32c_intel] [709732.359370] Code: ff ff f2 4d 0f 38 f1 93 a8 fe ff ff f2 4c 0f 38 f1 81 b0 fe ff ff f2 4c 0f 38 f1 8a b0 fe ff ff f2 4d 0f 38 f1 93 b0 fe ff ff <f2> 4c 0f 38 f1 81 b8 fe ff ff f2 4c 0f 38 f1 8a b8 fe ff ff f2 4d [709732.359373] RSP: 0018:ffffbb97008dfcd0 EFLAGS: 00000246 [709732.359377] RAX: 000000000000002a RBX: 0000000000000400 RCX: ffff922fc591dd50 [709732.359379] RDX: ffff922fc591dea0 RSI: 0000000000000a14 RDI: ffffffffc00dddc0 [709732.359382] RBP: 0000000000001000 R08: 000000000342d8c3 R09: 0000000000000000 [709732.359384] R10: 0000000000000000 R11: ffff922fc591dff0 R12: ffffbb97008dfe58 [709732.359386] R13: 000000000000000a R14: ffff922fd2b91e80 R15: ffff922fef83fe38 [709732.359395] ? crc_43+0x1e/0x1e [crc32c_intel] [709732.359403] ? crc32c_pcl_intel_update+0x97/0xb0 [crc32c_intel] [709732.359419] ? jbd2_journal_commit_transaction+0xaec/0x1a30 [jbd2] [709732.359425] ? irq_exit_rcu+0x3e/0xa0 [709732.359447] ? kjournald2+0xbd/0x270 [jbd2] [709732.359454] ? finish_wait+0x80/0x80 [709732.359470] ? commit_timeout+0x10/0x10 [jbd2] [709732.359476] ? kthread+0x116/0x130 [709732.359481] ? kthread_park+0x80/0x80 [709732.359488] ? ret_from_fork+0x1f/0x30 [709732.359494] ---[ end trace 081a19978e5f09f5 ]--- that is, nft_pipapo_avx2_lookup() uses the FPU running from a softirq that interrupted a kthread, also using the FPU. That's exactly the reason why irq_fpu_usable() is there: use it, and if we can't use the FPU, fall back to the non-AVX2 version of the lookup operation, i.e. nft_pipapo_lookup(). Reported-by: Arturo Borrero Gonzalez <arturo@netfilter.org> Cc: <stable@vger.kernel.org> # 5.6.x Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation") Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2021-05-10 05:58:22 +00:00
if (unlikely(!irq_fpu_usable()))
return nft_pipapo_lookup(net, set, key, ext);
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
m = rcu_dereference(priv->match);
netfilter: nft_set_pipapo_avx2: Skip LDMXCSR, we don't need a valid MXCSR state We don't need a valid MXCSR state for the lookup routines, none of the instructions we use rely on or affect any bit in the MXCSR register. Instead of calling kernel_fpu_begin(), we can pass 0 as mask to kernel_fpu_begin_mask() and spare one LDMXCSR instruction. Commit 49200d17d27d ("x86/fpu/64: Don't FNINIT in kernel_fpu_begin()") already speeds up lookups considerably, and by dropping the MCXSR initialisation we can now get a much smaller, but measurable, increase in matching rates. The table below reports matching rates and a wild approximation of clock cycles needed for a match in a "port,net" test with 10 entries from selftests/netfilter/nft_concat_range.sh, limited to the first field, i.e. the port (with nft_set_rbtree initialisation skipped), run on a single AMD Epyc 7351 thread (2.9GHz, 512 KiB L1D$, 8 MiB L2$). The (very rough) estimation of clock cycles is obtained by simply dividing frequency by matching rate. The "cycles spared" column refers to the difference in cycles compared to the previous row, and the rate increase also refers to the previous row. Results are averages of six runs. Merely for context, I'm also reporting packet rates obtained by skipping kernel_fpu_begin() and kernel_fpu_end() altogether (which shows a very limited impact now), as well as skipping the whole lookup function, compared to simply counting and dropping all packets using the netdev hook drop (see nft_concat_range.sh for details). This workload also includes packet generation with pktgen and the receive path of veth. |matching| est. | cycles | rate | | rate | cycles | spared |increase| | (Mpps) | | | | --------------------------------------|--------|--------|--------|--------| FNINIT, LDMXCSR (before 49200d17d27d) | 5.245 | 553 | - | - | LDMXCSR only (with 49200d17d27d) | 6.347 | 457 | 96 | 21.0% | Without LDMXCSR (this patch) | 6.461 | 449 | 8 | 1.8% | -------- for reference only: ---------|--------|--------|--------|--------| Without kernel_fpu_begin() | 6.513 | 445 | 4 | 0.8% | Without actual matching (return true) | 7.649 | 379 | 66 | 17.4% | Without lookup operation (netdev drop)| 10.320 | 281 | 98 | 34.9% | The clock cycles spared by avoiding LDMXCSR appear to be in line with CPI and latency indicated in the manuals of comparable architectures: Intel Skylake (CPI: 1, latency: 7) and AMD 12h (latency: 12) -- I couldn't find this information for AMD 17h. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2021-05-10 05:58:52 +00:00
/* This also protects access to all data related to scratch maps.
*
* Note that we don't need a valid MXCSR state for any of the
* operations we use here, so pass 0 as mask and spare a LDMXCSR
* instruction.
*/
kernel_fpu_begin_mask(0);
nft_set_pipapo: Introduce AVX2-based lookup implementation If the AVX2 set is available, we can exploit the repetitive characteristic of this algorithm to provide a fast, vectorised version by using 256-bit wide AVX2 operations for bucket loads and bitwise intersections. In most cases, this implementation consistently outperforms rbtree set instances despite the fact they are configured to use a given, single, ranged data type out of the ones used for performance measurements by the nft_concat_range.sh kselftest. That script, injecting packets directly on the ingoing device path with pktgen, reports, averaged over five runs on a single AMD Epyc 7402 thread (3.35GHz, 768 KiB L1D$, 12 MiB L2$), the figures below. CONFIG_RETPOLINE was not set here. Note that this is not a fair comparison over hash and rbtree set types: non-ranged entries (used to have a reference for hash types) would be matched faster than this, and matching on a single field only (which is the case for rbtree) is also significantly faster. However, it's not possible at the moment to choose this set type for non-ranged entries, and the current implementation also needs a few minor adjustments in order to match on less than two fields. ---------------.-----------------------------------.------------. AMD Epyc 7402 | baselines, Mpps | this patch | 1 thread |___________________________________|____________| 3.35GHz | | | | | | 768KiB L1D$ | netdev | hash | rbtree | | | ---------------| hook | no | single | | pipapo | type entries | drop | ranges | field | pipapo | AVX2 | ---------------|--------|--------|--------|--------|------------| net,port | | | | | | 1000 | 19.0 | 10.4 | 3.8 | 4.0 | 7.5 +87% | ---------------|--------|--------|--------|--------|------------| port,net | | | | | | 100 | 18.8 | 10.3 | 5.8 | 6.3 | 8.1 +29% | ---------------|--------|--------|--------|--------|------------| net6,port | | | | | | 1000 | 16.4 | 7.6 | 1.8 | 2.1 | 4.8 +128% | ---------------|--------|--------|--------|--------|------------| port,proto | | | | | | 30000 | 19.6 | 11.6 | 3.9 | 0.5 | 2.6 +420% | ---------------|--------|--------|--------|--------|------------| net6,port,mac | | | | | | 10 | 16.5 | 5.4 | 4.3 | 3.4 | 4.7 +38% | ---------------|--------|--------|--------|--------|------------| net6,port,mac, | | | | | | proto 1000 | 16.5 | 5.7 | 1.9 | 1.4 | 3.6 +26% | ---------------|--------|--------|--------|--------|------------| net,mac | | | | | | 1000 | 19.0 | 8.4 | 3.9 | 2.5 | 6.4 +156% | ---------------'--------'--------'--------'--------'------------' A similar strategy could be easily reused to implement specialised versions for other SIMD sets, and I plan to post at least a NEON version at a later time. Signed-off-by: Stefano Brivio <sbrivio@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-03-07 16:52:36 +00:00
scratch = *raw_cpu_ptr(m->scratch_aligned);
if (unlikely(!scratch)) {
kernel_fpu_end();
return false;
}
map_index = raw_cpu_read(nft_pipapo_avx2_scratch_index);
res = scratch + (map_index ? m->bsize_max : 0);
fill = scratch + (map_index ? 0 : m->bsize_max);
/* Starting map doesn't need to be set for this implementation */
nft_pipapo_avx2_prepare();
next_match:
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1, first = !i;
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
ret, rp, \
first, last))
if (likely(f->bb == 8)) {
if (f->groups == 1) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
} else if (f->groups == 2) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
} else if (f->groups == 4) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
} else if (f->groups == 6) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
} else if (f->groups == 16) {
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
} else {
ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
ret, rp,
first, last);
}
} else {
if (f->groups == 2) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
} else if (f->groups == 4) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
} else if (f->groups == 8) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
} else if (f->groups == 12) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
} else if (f->groups == 32) {
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
} else {
ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
ret, rp,
first, last);
}
}
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
if (ret < 0)
goto out;
if (last) {
*ext = &f->mt[ret].e->ext;
if (unlikely(nft_set_elem_expired(*ext) ||
!nft_set_elem_active(*ext, genmask))) {
ret = 0;
goto next_match;
}
goto out;
}
swap(res, fill);
rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
out:
if (i % 2)
raw_cpu_write(nft_pipapo_avx2_scratch_index, !map_index);
kernel_fpu_end();
return ret >= 0;
}