linux-stable/net/netfilter/nft_set_pipapo.h

// SPDX-License-Identifier: GPL-2.0-only
#ifndef _NFT_SET_PIPAPO_H
#define _NFT_SET_PIPAPO_H
#include <linux/log2.h>
#include <net/ipv6.h> /* For the maximum length of a field */
/* Count of concatenated fields depends on count of 32-bit nftables registers */
#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT
/* Restrict usage to multiple fields, make sure rbtree is used otherwise */
#define NFT_PIPAPO_MIN_FIELDS 2
/* Largest supported field size */
#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr))
#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE)
/* Bits to be grouped together in table buckets depending on set size */
#define NFT_PIPAPO_GROUP_BITS_INIT NFT_PIPAPO_GROUP_BITS_SMALL_SET
#define NFT_PIPAPO_GROUP_BITS_SMALL_SET 8
#define NFT_PIPAPO_GROUP_BITS_LARGE_SET 4
#define NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4				\
	BUILD_BUG_ON((NFT_PIPAPO_GROUP_BITS_SMALL_SET != 8) ||		\
		     (NFT_PIPAPO_GROUP_BITS_LARGE_SET != 4))
#define NFT_PIPAPO_GROUPS_PER_BYTE(f) (BITS_PER_BYTE / (f)->bb)
/* If a lookup table gets bigger than NFT_PIPAPO_LT_SIZE_HIGH, switch to the
* small group width, and switch to the big group width if the table gets
* smaller than NFT_PIPAPO_LT_SIZE_LOW.
*
* Picking 2MiB as threshold (for a single table) avoids as much as possible
* crossing page boundaries on most architectures (x86-64 and MIPS huge pages,
* ARMv7 supersections, POWER "large" pages, SPARC Level 1 regions, etc.), which
* keeps performance nice in case kvmalloc() gives us non-contiguous areas.
*/
#define NFT_PIPAPO_LT_SIZE_THRESHOLD (1 << 21)
#define NFT_PIPAPO_LT_SIZE_HYSTERESIS (1 << 16)
#define NFT_PIPAPO_LT_SIZE_HIGH NFT_PIPAPO_LT_SIZE_THRESHOLD
#define NFT_PIPAPO_LT_SIZE_LOW		(NFT_PIPAPO_LT_SIZE_THRESHOLD -	\
					 NFT_PIPAPO_LT_SIZE_HYSTERESIS)
/* Fields are padded to 32 bits in input registers */
#define NFT_PIPAPO_GROUPS_PADDED_SIZE(f)				\
	(round_up((f)->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f), sizeof(u32)))
#define NFT_PIPAPO_GROUPS_PADDING(f)					\
	(NFT_PIPAPO_GROUPS_PADDED_SIZE(f) - (f)->groups /		\
	 NFT_PIPAPO_GROUPS_PER_BYTE(f))
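/* Example: a 2-byte field with 4-bit buckets spans 4 groups, that is, 2 input
 * bytes, padded to 4: NFT_PIPAPO_GROUPS_PADDED_SIZE() is 4 and
 * NFT_PIPAPO_GROUPS_PADDING() is 2.
 */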
/* Number of buckets given by 2 ^ n, with n bucket bits */
#define NFT_PIPAPO_BUCKETS(bb) (1 << (bb))
/* Each n-bit range maps to up to n * 2 rules */
#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2))
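/* Example: with NFT_PIPAPO_MAX_BYTES = 16, NFT_PIPAPO_MAX_BITS is 128, so a
 * single range expands to at most 256 rules, and const_ilog2(256) = 8 bits
 * are enough for the rule count of a mapping bucket.
 */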
/* Use the rest of mapping table buckets for rule indices, but it makes no sense
* to exceed 32 bits
*/
#if BITS_PER_LONG == 64
#define NFT_PIPAPO_MAP_TOBITS 32
#else
#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS)
#endif
/* ...which gives us the highest allowed index for a rule */
#define NFT_PIPAPO_RULE0_MAX	((1UL << (NFT_PIPAPO_MAP_TOBITS - 1))	\
				 - (1UL << NFT_PIPAPO_MAP_NBITS))
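/* Example: on a 64-bit platform, with NFT_PIPAPO_MAP_TOBITS = 32 and
 * NFT_PIPAPO_MAP_NBITS = 8, this gives 2^31 - 2^8 = 2147483392.
 */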
/* Definitions for vectorised implementations */
#ifdef NFT_PIPAPO_ALIGN
#define NFT_PIPAPO_ALIGN_HEADROOM					\
	(NFT_PIPAPO_ALIGN - ARCH_KMALLOC_MINALIGN)
#define NFT_PIPAPO_LT_ALIGN(lt) (PTR_ALIGN((lt), NFT_PIPAPO_ALIGN))
#define NFT_PIPAPO_LT_ASSIGN(field, x)					\
	do {								\
		(field)->lt_aligned = NFT_PIPAPO_LT_ALIGN(x);		\
		(field)->lt = (x);					\
	} while (0)
#else
#define NFT_PIPAPO_ALIGN_HEADROOM 0
#define NFT_PIPAPO_LT_ALIGN(lt) (lt)
#define NFT_PIPAPO_LT_ASSIGN(field, x) ((field)->lt = (x))
#endif /* NFT_PIPAPO_ALIGN */
#define nft_pipapo_for_each_field(field, index, match)			\
	for ((field) = (match)->f, (index) = 0;				\
	     (index) < (match)->field_count;				\
	     (index)++, (field)++)
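/* Usage sketch (caller names are illustrative):
 *
 *	struct nft_pipapo_field *f;
 *	int i;
 *
 *	nft_pipapo_for_each_field(f, i, m)
 *		process_field(f);
 */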
/**
* union nft_pipapo_map_bucket - Bucket of mapping table
* @to: First rule number (in next field) this rule maps to
* @n: Number of rules (in next field) this rule maps to
* @e: If there's no next field, pointer to element this rule maps to
*/
union nft_pipapo_map_bucket {
	struct {
#if BITS_PER_LONG == 64
		static_assert(NFT_PIPAPO_MAP_TOBITS <= 32);
		u32 to;

		static_assert(NFT_PIPAPO_MAP_NBITS <= 32);
		u32 n;
#else
		unsigned long to:NFT_PIPAPO_MAP_TOBITS;
		unsigned long  n:NFT_PIPAPO_MAP_NBITS;
#endif
	};
	struct nft_pipapo_elem *e;
};
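/* That is: a rule matching in one field maps to rules 'to' ... 'to + n - 1'
 * in the next field, or, if this is the last field, directly to an element.
 */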
/**
* struct nft_pipapo_field - Lookup, mapping tables and related data for a field
* @groups: Amount of bit groups
* @rules: Number of inserted rules
* @bsize: Size of each bucket in lookup table, in longs
* @bb: Number of bits grouped together in lookup table buckets
* @lt: Lookup table: 'groups' rows of buckets
* @lt_aligned: Version of @lt aligned to NFT_PIPAPO_ALIGN bytes
* @mt: Mapping table: one bucket per rule
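*
* The lookup table is laid out as @groups consecutive rows of
* NFT_PIPAPO_BUCKETS(@bb) buckets, each @bsize longs wide, as assumed by
* the bucket intersection helpers below.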
*/
struct nft_pipapo_field {
	int groups;
	unsigned long rules;
	size_t bsize;
	int bb;
#ifdef NFT_PIPAPO_ALIGN
	unsigned long *lt_aligned;
#endif
	unsigned long *lt;
	union nft_pipapo_map_bucket *mt;
};
/**
* struct nft_pipapo_scratch - percpu data used for lookup and matching
* @map_index: Current working bitmap index, toggled between field matches
* @align_off: Offset to get the originally allocated address
* @map: store partial matching results during lookup
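*
* The map area holds two working bitmaps selected by @map_index: one keeps
* the result of the current field match while the other is filled for the
* next field, and the two roles are swapped after every field.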
*/
struct nft_pipapo_scratch {
	u8 map_index;
	u32 align_off;
	unsigned long map[];
};
/**
* struct nft_pipapo_match - Data used for lookup and matching
* @field_count: Amount of fields in set
* @scratch: Preallocated per-CPU maps for partial matching results
* @bsize_max: Maximum lookup table bucket size of all fields, in longs
* @rcu: Matching data is swapped on commits
* @f: Fields, with lookup and mapping tables
*/
struct nft_pipapo_match {
	int field_count;
	struct nft_pipapo_scratch * __percpu *scratch;
	size_t bsize_max;
	struct rcu_head rcu;
	struct nft_pipapo_field f[] __counted_by(field_count);
};
/**
* struct nft_pipapo - Representation of a set
* @match: Currently in-use matching data
* @clone: Copy where pending insertions and deletions are kept
* @width: Total bytes to be matched for one packet, including padding
* @dirty: Working copy has pending insertions or deletions
* @last_gc: Timestamp of last garbage collection run, jiffies
*/
struct nft_pipapo {
	struct nft_pipapo_match __rcu *match;
	struct nft_pipapo_match *clone;
	int width;
	bool dirty;
	unsigned long last_gc;
};
struct nft_pipapo_elem;
/**
* struct nft_pipapo_elem - API-facing representation of single set element
* @priv: element placeholder
* @ext: nftables API extensions
*/
struct nft_pipapo_elem {
	struct nft_elem_priv priv;
	struct nft_set_ext ext;
};
int pipapo_refill(unsigned long *map, int len, int rules, unsigned long *dst,
		  union nft_pipapo_map_bucket *mt, bool match_only);
/**
* pipapo_and_field_buckets_4bit() - Intersect 4-bit buckets
* @f: Field including lookup table
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
static inline void pipapo_and_field_buckets_4bit(struct nft_pipapo_field *f,
						 unsigned long *dst,
						 const u8 *data)
{
	unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
	int group;

	for (group = 0; group < f->groups; group += BITS_PER_BYTE / 4, data++) {
		u8 v;

		/* Upper nibble: bucket for the first of two groups in byte */
		v = *data >> 4;
		__bitmap_and(dst, dst, lt + v * f->bsize,
			     f->bsize * BITS_PER_LONG);
		lt += f->bsize * NFT_PIPAPO_BUCKETS(4);

		/* Lower nibble: bucket for the second group */
		v = *data & 0x0f;
		__bitmap_and(dst, dst, lt + v * f->bsize,
			     f->bsize * BITS_PER_LONG);
		lt += f->bsize * NFT_PIPAPO_BUCKETS(4);
	}
}
/**
* pipapo_and_field_buckets_8bit() - Intersect 8-bit buckets
* @f: Field including lookup table
* @dst: Area to store result
* @data: Input data selecting table buckets
*/
static inline void pipapo_and_field_buckets_8bit(struct nft_pipapo_field *f,
						 unsigned long *dst,
						 const u8 *data)
{
	unsigned long *lt = NFT_PIPAPO_LT_ALIGN(f->lt);
	int group;

	for (group = 0; group < f->groups; group++, data++) {
		__bitmap_and(dst, dst, lt + *data * f->bsize,
			     f->bsize * BITS_PER_LONG);
		lt += f->bsize * NFT_PIPAPO_BUCKETS(8);
	}
}
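/* A caller is expected to pick the helper matching the field's current bucket
 * width, along these lines (illustrative sketch, variable names assumed):
 *
 *	if (f->bb == 8)
 *		pipapo_and_field_buckets_8bit(f, res_map, rp);
 *	else
 *		pipapo_and_field_buckets_4bit(f, res_map, rp);
 */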
/**
* pipapo_estimate_size() - Estimate worst-case for set size
* @desc: Set description, element count and field description used here
*
* The size for this set type can vary dramatically, as it depends on the number
* of rules (composing netmasks) the entries expand to. We compute the worst
* case here.
*
* In general, for a non-ranged entry or a single composing netmask, we need
* one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that
* is, each input bit needs four bits of matching data), plus a bucket in the
* mapping table for each field.
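*
* For instance, following the computation below with 8-bit groups
* (NFT_PIPAPO_GROUP_BITS_INIT) and 8-byte mapping buckets (64-bit platform),
* a 4-byte field yields ilog2(32) * 2 = 10 worst-case rules, that is,
* 10 * 256 / 8 = 320 lookup table bytes plus 10 * 8 = 80 mapping table bytes
* per entry.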
*
* Return: worst-case set size in bytes, 0 on any overflow
*/
static u64 pipapo_estimate_size(const struct nft_set_desc *desc)
{
	unsigned long entry_size;
	u64 size;
	int i;

	for (i = 0, entry_size = 0; i < desc->field_count; i++) {
		unsigned long rules;

		if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES)
			return 0;

		/* Worst-case ranges for each concatenated field: each n-bit
		 * field can expand to up to n * 2 rules in each bucket, and
		 * each rule also needs a mapping bucket.
		 */
		rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2;
		entry_size += rules *
			      NFT_PIPAPO_BUCKETS(NFT_PIPAPO_GROUP_BITS_INIT) /
			      BITS_PER_BYTE;
		entry_size += rules * sizeof(union nft_pipapo_map_bucket);
	}

	/* Rules in lookup and mapping tables are needed for each entry */
	size = desc->size * entry_size;
	if (size && div_u64(size, desc->size) != entry_size)
		return 0;

	size += sizeof(struct nft_pipapo) + sizeof(struct nft_pipapo_match) * 2;

	size += sizeof(struct nft_pipapo_field) * desc->field_count;

	return size;
}
#endif /* _NFT_SET_PIPAPO_H */