Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

1) Protect nft_ct template with global mutex, from Pavel Skripkin.

2) Two recent commits switched inet rt and nexthop exception hashes
   from jhash to siphash. If those two spots are problematic then
   conntrack is affected as well, so switch voer to siphash too.
   While at it, add a hard upper limit on chain lengths and reject
   insertion if this is hit. Patches from Florian Westphal.

3) Fix use-after-scope in nf_socket_ipv6 reported by KASAN,
   from Benjamin Hesmans.

* git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf:
  netfilter: socket: icmp6: fix use-after-scope
  netfilter: refuse insertion if chain has grown too large
  netfilter: conntrack: switch to siphash
  netfilter: conntrack: sanitize table size default settings
  netfilter: nft_ct: protect nft_ct_pcpu_template_refcnt with mutex
====================

Link: https://lore.kernel.org/r/20210903163020.13741-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2021-09-03 16:20:36 -07:00
commit 10905b4a68
10 changed files with 122 additions and 58 deletions

View File

@ -17,9 +17,8 @@ nf_conntrack_acct - BOOLEAN
nf_conntrack_buckets - INTEGER
Size of hash table. If not specified as parameter during module
loading, the default size is calculated by dividing total memory
by 16384 to determine the number of buckets but the hash table will
never have fewer than 32 and limited to 16384 buckets. For systems
with more than 4GB of memory it will be 65536 buckets.
by 16384 to determine the number of buckets. The hash table will
never have fewer than 1024 and never more than 262144 buckets.
This sysctl is only writeable in the initial net namespace.
nf_conntrack_checksum - BOOLEAN
@ -100,8 +99,12 @@ nf_conntrack_log_invalid - INTEGER
Log invalid packets of a type specified by value.
nf_conntrack_max - INTEGER
Size of connection tracking table. Default value is
nf_conntrack_buckets value * 4.
Maximum number of allowed connection tracking entries. This value is set
to nf_conntrack_buckets by default.
Note that connection tracking entries are added to the table twice -- once
for the original direction and once for the reply direction (i.e., with
the reversed address). This means that with default settings a maxed-out
table will have a average hash chain length of 2, not 1.
nf_conntrack_tcp_be_liberal - BOOLEAN
- 0 - disabled (default)

View File

@ -18,6 +18,7 @@ struct ip_conntrack_stat {
unsigned int expect_create;
unsigned int expect_delete;
unsigned int search_restart;
unsigned int chaintoolong;
};
#define NFCT_INFOMASK 7UL

View File

@ -258,6 +258,7 @@ enum ctattr_stats_cpu {
CTA_STATS_ERROR,
CTA_STATS_SEARCH_RESTART,
CTA_STATS_CLASH_RESOLVE,
CTA_STATS_CHAIN_TOOLONG,
__CTA_STATS_MAX,
};
#define CTA_STATS_MAX (__CTA_STATS_MAX - 1)

View File

@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
{
__be16 dport, sport;
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb);
struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
struct sk_buff *data_skb = NULL;
int doff = 0;
int thoff = 0, tproto;
@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
thoff + sizeof(*hp);
} else if (tproto == IPPROTO_ICMPV6) {
struct ipv6hdr ipv6_var;
if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
&sport, &dport, &ipv6_var))
return NULL;

View File

@ -21,7 +21,6 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h>
#include <linux/percpu.h>
@ -78,6 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all;
#define GC_SCAN_INTERVAL (120u * HZ)
#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10)
#define MAX_CHAINLEN 64u
static struct conntrack_gc_work conntrack_gc_work;
void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
@ -184,25 +185,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
unsigned int nf_conntrack_max __read_mostly;
EXPORT_SYMBOL_GPL(nf_conntrack_max);
seqcount_spinlock_t nf_conntrack_generation __read_mostly;
static unsigned int nf_conntrack_hash_rnd __read_mostly;
static siphash_key_t nf_conntrack_hash_rnd __read_mostly;
static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple,
const struct net *net)
{
unsigned int n;
u32 seed;
struct {
struct nf_conntrack_man src;
union nf_inet_addr dst_addr;
u32 net_mix;
u16 dport;
u16 proto;
} __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd));
/* The direction must be ignored, so we hash everything up to the
* destination ports (which is a multiple of 4) and treat the last
* three bytes manually.
*/
seed = nf_conntrack_hash_rnd ^ net_hash_mix(net);
n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32);
return jhash2((u32 *)tuple, n, seed ^
(((__force __u16)tuple->dst.u.all << 16) |
tuple->dst.protonum));
memset(&combined, 0, sizeof(combined));
/* The direction must be ignored, so handle usable members manually. */
combined.src = tuple->src;
combined.dst_addr = tuple->dst.u3;
combined.net_mix = net_hash_mix(net);
combined.dport = (__force __u16)tuple->dst.u.all;
combined.proto = tuple->dst.protonum;
return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd);
}
static u32 scale_hash(u32 hash)
@ -835,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
unsigned int chainlen = 0;
unsigned int sequence;
int err = -EEXIST;
zone = nf_ct_zone(ct);
@ -849,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* See if there's one in the list already, including reverse */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
if (chainlen++ > MAX_CHAINLEN)
goto chaintoolong;
}
chainlen = 0;
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
if (chainlen++ > MAX_CHAINLEN)
goto chaintoolong;
}
smp_wmb();
/* The caller holds a reference to this object */
@ -867,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
NF_CT_STAT_INC(net, insert);
local_bh_enable();
return 0;
chaintoolong:
NF_CT_STAT_INC(net, chaintoolong);
err = -ENOSPC;
out:
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
return -EEXIST;
return err;
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@ -1084,6 +1104,7 @@ int
__nf_conntrack_confirm(struct sk_buff *skb)
{
const struct nf_conntrack_zone *zone;
unsigned int chainlen = 0, sequence;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@ -1091,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
int ret = NF_DROP;
ct = nf_ct_get(skb, &ctinfo);
@ -1151,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* See if there's one in the list already, including reverse:
NAT could have grabbed it without realizing, since we're
not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
if (chainlen++ > MAX_CHAINLEN)
goto chaintoolong;
}
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
chainlen = 0;
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) {
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
if (chainlen++ > MAX_CHAINLEN) {
chaintoolong:
nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, chaintoolong);
NF_CT_STAT_INC(net, insert_failed);
ret = NF_DROP;
goto dying;
}
}
/* Timer relative to confirmation time, not original
setting time, otherwise we'd get timer wrap in
@ -2594,26 +2627,24 @@ int nf_conntrack_init_start(void)
spin_lock_init(&nf_conntrack_locks[i]);
if (!nf_conntrack_htable_size) {
/* Idea from tcp.c: use 1/16384 of memory.
* On i386: 32MB machine has 512 buckets.
* >= 1GB machines have 16384 buckets.
* >= 4GB machines have 65536 buckets.
*/
nf_conntrack_htable_size
= (((nr_pages << PAGE_SHIFT) / 16384)
/ sizeof(struct hlist_head));
if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
nf_conntrack_htable_size = 65536;
if (BITS_PER_LONG >= 64 &&
nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE)))
nf_conntrack_htable_size = 262144;
else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE))
nf_conntrack_htable_size = 16384;
if (nf_conntrack_htable_size < 32)
nf_conntrack_htable_size = 32;
nf_conntrack_htable_size = 65536;
/* Use a max. factor of four by default to get the same max as
* with the old struct list_heads. When a table size is given
* we use the old value of 8 to avoid reducing the max.
* entries. */
max_factor = 4;
if (nf_conntrack_htable_size < 1024)
nf_conntrack_htable_size = 1024;
/* Use a max. factor of one by default to keep the average
* hash chain length at 2 entries. Each entry has to be added
* twice (once for original direction, once for reply).
* When a table size is given we use the old value of 8 to
* avoid implicit reduction of the max entries setting.
*/
max_factor = 1;
}
nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1);

View File

@ -17,7 +17,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
static unsigned int nf_ct_expect_hashrnd __read_mostly;
static siphash_key_t nf_ct_expect_hashrnd __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
@ -81,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t)
static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash, seed;
struct {
union nf_inet_addr dst_addr;
u32 net_mix;
u16 dport;
u8 l3num;
u8 protonum;
} __aligned(SIPHASH_ALIGNMENT) combined;
u32 hash;
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
memset(&combined, 0, sizeof(combined));
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
(__force __u16)tuple->dst.u.all) ^ seed);
combined.dst_addr = tuple->dst.u3;
combined.net_mix = net_hash_mix(n);
combined.dport = (__force __u16)tuple->dst.u.all;
combined.l3num = tuple->src.l3num;
combined.protonum = tuple->dst.protonum;
hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}

View File

@ -2528,7 +2528,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
nla_put_be32(skb, CTA_STATS_SEARCH_RESTART,
htonl(st->search_restart)) ||
nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE,
htonl(st->clash_resolve)))
htonl(st->clash_resolve)) ||
nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG,
htonl(st->chaintoolong)))
goto nla_put_failure;
nlmsg_end(skb, nlh);

View File

@ -432,7 +432,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
unsigned int nr_conntracks;
if (v == SEQ_START_TOKEN) {
seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n");
return 0;
}
@ -447,7 +447,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v)
st->invalid,
0,
0,
0,
st->chaintoolong,
st->insert,
st->insert_failed,
st->drop,

View File

@ -13,7 +13,7 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <net/xfrm.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;
static siphash_key_t nf_nat_hash_rnd __read_mostly;
struct nf_nat_lookup_hook_priv {
struct nf_hook_entries __rcu *entries;
@ -153,12 +153,22 @@ static unsigned int
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
struct {
struct nf_conntrack_man src;
u32 net_mix;
u32 protonum;
} __aligned(SIPHASH_ALIGNMENT) combined;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
memset(&combined, 0, sizeof(combined));
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
combined.src = tuple->src;
combined.net_mix = net_hash_mix(n);
combined.protonum = tuple->dst.protonum;
hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd);
return reciprocal_scale(hash, nf_nat_htable_size);
}

View File

@ -41,6 +41,7 @@ struct nft_ct_helper_obj {
#ifdef CONFIG_NF_CONNTRACK_ZONES
static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template);
static unsigned int nft_ct_pcpu_template_refcnt __read_mostly;
static DEFINE_MUTEX(nft_ct_pcpu_mutex);
#endif
static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
@ -525,8 +526,10 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv)
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
mutex_lock(&nft_ct_pcpu_mutex);
if (--nft_ct_pcpu_template_refcnt == 0)
nft_ct_tmpl_put_pcpu();
mutex_unlock(&nft_ct_pcpu_mutex);
break;
#endif
default:
@ -564,9 +567,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
#endif
#ifdef CONFIG_NF_CONNTRACK_ZONES
case NFT_CT_ZONE:
if (!nft_ct_tmpl_alloc_pcpu())
mutex_lock(&nft_ct_pcpu_mutex);
if (!nft_ct_tmpl_alloc_pcpu()) {
mutex_unlock(&nft_ct_pcpu_mutex);
return -ENOMEM;
}
nft_ct_pcpu_template_refcnt++;
mutex_unlock(&nft_ct_pcpu_mutex);
len = sizeof(u16);
break;
#endif