// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
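
/* Per-set private data: the root of the rbtree, a writer lock paired with
 * a seqcount so lookups can run locklessly and retry on concurrent updates,
 * and the jiffies timestamp of the last garbage collection run.
 */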
struct nft_rbtree {
	struct rb_root		root;
	rwlock_t		lock;
	seqcount_rwlock_t	count;
	unsigned long		last_gc;
};

struct nft_rbtree_elem {
	struct nft_elem_priv	priv;
	struct rb_node		node;
	struct nft_set_ext	ext;
};

static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
	return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
	       (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}

static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
{
	return !nft_rbtree_interval_end(rbe);
}

static int nft_rbtree_cmp(const struct nft_set *set,
			  const struct nft_rbtree_elem *e1,
			  const struct nft_rbtree_elem *e2)
{
	return memcmp(nft_set_ext_key(&e1->ext), nft_set_ext_key(&e2->ext),
		      set->klen);
}

static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
{
	return nft_set_elem_expired(&rbe->ext);
}
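
/* Lockless lookup: walk the tree under the seqcount snapshot taken by the
 * caller and bail out as soon as a concurrent writer is detected, so the
 * caller can retry. For interval sets, the closest interval-start element
 * seen while descending is kept as a candidate match.
 */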
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (interval &&
			    !nft_rbtree_cmp(set, rbe, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_rbtree_elem_expired(rbe))
				return false;

			if (nft_rbtree_interval_end(rbe)) {
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_elem_expired(interval) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
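
/* Fast path lookup: try locklessly first; if a writer raced with us, look
 * up again under the read lock, where writers are excluded.
 */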
INDIRECT_CALLABLE_SCOPE
bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
		       const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
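
/* Lockless element retrieval for the netlink GET path: find the element
 * whose interval endpoint type matches NFT_SET_ELEM_INTERVAL_END in @flags,
 * under the seqcount snapshot taken by the caller.
 */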
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			if (!(flags & NFT_SET_ELEM_INTERVAL_END))
				interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
			if (flags & NFT_SET_ELEM_INTERVAL_END)
				interval = rbe;
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			if (nft_set_elem_expired(&rbe->ext))
				return false;

			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_set_elem_expired(&interval->ext) &&
	    ((!nft_rbtree_interval_end(interval) &&
	      !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
	     (nft_rbtree_interval_end(interval) &&
	      (flags & NFT_SET_ELEM_INTERVAL_END)))) {
		*elem = interval;
		return true;
	}

	return false;
}
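
/* GET entry point: try the lockless path, then retry under the read lock on
 * contention. ->priv is the first member of struct nft_rbtree_elem (see the
 * BUILD_BUG_ON() in nft_rbtree_init()), so returning &rbe->priv preserves
 * the ERR_PTR(-ENOENT) encoding when no element was found.
 */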
static struct nft_elem_priv *
nft_rbtree_get(const struct net *net, const struct nft_set *set,
	       const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return &rbe->priv;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	read_unlock_bh(&priv->lock);

	if (!ret)
		return ERR_PTR(-ENOENT);

	return &rbe->priv;
}

static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
				      struct nft_rbtree *priv,
				      struct nft_rbtree_elem *rbe)
{
	lockdep_assert_held_write(&priv->lock);
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	rb_erase(&rbe->node, &priv->root);
}
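
/* Synchronous GC of one expired element from the insert path, to avoid bogus
 * overlap reports: the expired start element and its paired end element are
 * unlinked and queued for destruction. Returns the removed end element (or
 * NULL) so the caller can detect that a cached pointer went stale.
 */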
static const struct nft_rbtree_elem *
nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
		   struct nft_rbtree_elem *rbe)
{
	struct nft_set *set = (struct nft_set *)__set;
	struct rb_node *prev = rb_prev(&rbe->node);
	struct net *net = read_pnet(&set->net);
	struct nft_rbtree_elem *rbe_prev;
	struct nft_trans_gc *gc;

	gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
	if (!gc)
		return ERR_PTR(-ENOMEM);

	/* search for end interval coming before this element.
	 * end intervals don't carry a timeout extension, they
	 * are coupled with the interval start element.
	 */
	while (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		if (nft_rbtree_interval_end(rbe_prev) &&
		    nft_set_elem_active(&rbe_prev->ext, NFT_GENMASK_ANY))
			break;

		prev = rb_prev(prev);
	}

	rbe_prev = NULL;
	if (prev) {
		rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
		nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);

		/* There is always room in this trans gc for this element,
		 * memory allocation never actually happens, hence, the warning
		 * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
		 * this is synchronous gc which never fails.
		 */
		gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
		if (WARN_ON_ONCE(!gc))
			return ERR_PTR(-ENOMEM);

		nft_trans_gc_elem_add(gc, rbe_prev);
	}

	nft_rbtree_gc_elem_remove(net, set, priv, rbe);
	gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
	if (WARN_ON_ONCE(!gc))
		return ERR_PTR(-ENOMEM);

	nft_trans_gc_elem_add(gc, rbe);

	nft_trans_gc_queue_sync_done(gc);

	return rbe_prev;
}

static bool nft_rbtree_update_first(const struct nft_set *set,
				    struct nft_rbtree_elem *rbe,
				    struct rb_node *first)
{
	struct nft_rbtree_elem *first_elem;

	first_elem = rb_entry(first, struct nft_rbtree_elem, node);
	/* this element is closest to where the new element is to be inserted:
	 * update the first element for the node list path.
	 */
	if (nft_rbtree_cmp(set, rbe, first_elem) < 0)
		return true;

	return false;
}
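
/* Insert a new element after checking for overlaps. The tree is walked twice:
 * first over the ordered elements to record the closest greater-or-equal
 * (rbe_ge) and less-or-equal (rbe_le) neighbours and reject overlapping
 * ranges, then a second time to pick the final insertion point.
 */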
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_elem_priv **elem_priv)
{
	struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
	struct rb_node *node, *next, *parent, **p, *first = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 cur_genmask = nft_genmask_cur(net);
	u8 genmask = nft_genmask_next(net);
	u64 tstamp = nft_net_tstamp(net);
	int d;

	/* Descend the tree to search for an existing element greater than the
	 * new element's key value. This is the first element to walk the
	 * ordered elements to find possible overlap.
	 */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0) {
			p = &parent->rb_left;
		} else if (d > 0) {
			if (!first ||
			    nft_rbtree_update_first(set, rbe, first))
				first = &rbe->node;

			p = &parent->rb_right;
		} else {
			if (nft_rbtree_interval_end(rbe))
				p = &parent->rb_left;
			else
				p = &parent->rb_right;
		}
	}

	if (!first)
		first = rb_first(&priv->root);

	/* Detect overlap by going through the list of valid tree nodes.
	 * Values stored in the tree are in reversed order, starting from
	 * highest to lowest value.
	 */
	for (node = first; node != NULL; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (!nft_set_elem_active(&rbe->ext, genmask))
			continue;

		/* perform garbage collection to avoid bogus overlap reports
		 * but skip new elements in this transaction.
		 */
		if (__nft_set_elem_expired(&rbe->ext, tstamp) &&
		    nft_set_elem_active(&rbe->ext, cur_genmask)) {
			const struct nft_rbtree_elem *removed_end;

			removed_end = nft_rbtree_gc_elem(set, priv, rbe);
			if (IS_ERR(removed_end))
				return PTR_ERR(removed_end);

			if (removed_end == rbe_le || removed_end == rbe_ge)
				return -EAGAIN;

			continue;
		}

		d = nft_rbtree_cmp(set, rbe, new);
		if (d == 0) {
			/* Matching end element: no need to look for an
			 * overlapping greater or equal element.
			 */
			if (nft_rbtree_interval_end(rbe)) {
				rbe_le = rbe;
				break;
			}

			/* first element that is greater or equal to the key
			 * value.
			 */
			if (!rbe_ge) {
				rbe_ge = rbe;
				continue;
			}

			/* this is a closer greater or equal element, update
			 * it.
			 */
			if (nft_rbtree_cmp(set, rbe_ge, new) != 0) {
				rbe_ge = rbe;
				continue;
			}

			/* element is equal to the key value, make sure the
			 * flags are the same: an existing greater or equal
			 * start element must not be replaced by a greater or
			 * equal end element.
			 */
			if ((nft_rbtree_interval_start(new) &&
			     nft_rbtree_interval_start(rbe_ge)) ||
			    (nft_rbtree_interval_end(new) &&
			     nft_rbtree_interval_end(rbe_ge))) {
				rbe_ge = rbe;
				continue;
			}
		} else if (d > 0) {
			/* annotate element greater than the new element. */
			rbe_ge = rbe;
			continue;
		} else if (d < 0) {
			/* annotate element less than the new element. */
			rbe_le = rbe;
			break;
		}
	}

	/* - new start element matching existing start element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
	    nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
		*elem_priv = &rbe_ge->priv;
		return -EEXIST;
	}

	/* - new end element matching existing end element: full overlap
	 *   reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
	 */
	if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
	    nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
		*elem_priv = &rbe_le->priv;
		return -EEXIST;
	}

	/* - new start element with existing closest, less or equal key value
	 *   being a start element: partial overlap, reported as -ENOTEMPTY.
	 *   Anonymous sets allow for two consecutive start elements since they
	 *   are constant, skip them to avoid bogus overlap reports.
	 */
	if (!nft_set_is_anonymous(set) && rbe_le &&
	    nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, less or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_le &&
	    nft_rbtree_interval_end(rbe_le) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* - new end element with existing closest, greater or equal key value
	 *   being an end element: partial overlap, reported as -ENOTEMPTY.
	 */
	if (rbe_ge &&
	    nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
		return -ENOTEMPTY;

	/* Accepted element: pick insertion point depending on key value */
	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = nft_rbtree_cmp(set, rbe, new);

		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else if (nft_rbtree_interval_end(rbe))
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}

	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
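
/* Transaction-side insert: take the write lock and bump the seqcount so that
 * lockless readers retry. Loop on -EAGAIN, which the overlap check returns
 * when synchronous GC removed an element it still held a pointer to.
 */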
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_elem_priv **elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
	struct nft_rbtree *priv = nft_set_priv(set);
	int err;

	do {
		if (fatal_signal_pending(current))
			return -EINTR;

		cond_resched();

		write_lock_bh(&priv->lock);
		write_seqcount_begin(&priv->count);
		err = __nft_rbtree_insert(net, set, rbe, elem_priv);
		write_seqcount_end(&priv->count);
		write_unlock_bh(&priv->lock);
	} while (err == -EAGAIN);

	return err;
}
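
/* Unlink an element under the write lock and seqcount so that lockless
 * readers traversing the tree retry.
 */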
static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
{
	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}

static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);
	struct nft_rbtree *priv = nft_set_priv(set);

	nft_rbtree_erase(priv, rbe);
}

static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_clear(net, &rbe->ext);
}

static void nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set,
			     struct nft_elem_priv *elem_priv)
{
	struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem_priv);

	nft_set_elem_change_active(net, set, &rbe->ext);
}
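
/* Find the element to deactivate for the next generation, matching the
 * interval endpoint type of @elem so that a start element never deactivates
 * its paired end element (and vice versa).
 */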
static struct nft_elem_priv *
nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
		      const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	u8 genmask = nft_genmask_next(net);
	u64 tstamp = nft_net_tstamp(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
			   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (__nft_set_elem_expired(&rbe->ext, tstamp)) {
				break;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, &rbe->priv);
			return &rbe->priv;
		}
	}
	return NULL;
}
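
/* Walk all elements in key order under the read lock, honouring iter->skip
 * and stopping early if the callback reports an error.
 */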
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;

		iter->err = iter->fn(ctx, set, iter, &rbe->priv);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}

static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
				 struct nft_rbtree *priv,
				 struct nft_rbtree_elem *rbe)
{
	nft_setelem_data_deactivate(net, set, &rbe->priv);
	nft_rbtree_erase(priv, rbe);
}
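
/* Periodic synchronous GC, run from the commit path: walk the whole tree and
 * unlink expired elements, queueing them on a GC transaction for destruction.
 */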
static void nft_rbtree_gc(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe, *rbe_end = NULL;
	struct net *net = read_pnet(&set->net);
	u64 tstamp = nft_net_tstamp(net);
	struct rb_node *node, *next;
	struct nft_trans_gc *gc;

	gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
	if (!gc)
		return;

	for (node = rb_first(&priv->root); node ; node = next) {
		next = rb_next(node);

		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* elements are reversed in the rbtree for historical reasons,
		 * from highest to lowest value, that is why end element is
		 * always visited before the start element.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!__nft_set_elem_expired(&rbe->ext, tstamp))
			continue;

		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			goto try_later;

		/* end element needs to be removed first, it has
		 * no timeout extension.
		 */
		if (rbe_end) {
			nft_rbtree_gc_remove(net, set, priv, rbe_end);
			nft_trans_gc_elem_add(gc, rbe_end);
			rbe_end = NULL;
		}

		gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
		if (!gc)
			goto try_later;

		nft_rbtree_gc_remove(net, set, priv, rbe);
		nft_trans_gc_elem_add(gc, rbe);
	}

try_later:

	if (gc) {
		gc = nft_trans_gc_catchall_sync(gc);
		nft_trans_gc_queue_sync_done(gc);
		priv->last_gc = jiffies;
	}
}

static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}

static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);

	rwlock_init(&priv->lock);
	seqcount_rwlock_init(&priv->count, &priv->lock);
	priv->root = RB_ROOT;

	return 0;
}

static void nft_rbtree_destroy(const struct nft_ctx *ctx,
			       const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
	}
}

static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
				struct nft_set_estimate *est)
{
	if (desc->field_count > 1)
		return false;

	if (desc->size)
		est->size = sizeof(struct nft_rbtree) +
			    desc->size * sizeof(struct nft_rbtree_elem);
	else
		est->size = ~0;

	est->lookup = NFT_SET_CLASS_O_LOG_N;
	est->space  = NFT_SET_CLASS_O_N;

	return true;
}
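
/* Commit hook: trigger a GC run once the set's GC interval has elapsed
 * since the last one.
 */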
static void nft_rbtree_commit(struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);

	if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
		nft_rbtree_gc(set);
}

static void nft_rbtree_gc_init(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);

	priv->last_gc = jiffies;
}

const struct nft_set_type nft_set_rbtree_type = {
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.commit		= nft_rbtree_commit,
		.gc_init	= nft_rbtree_gc_init,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};